From dba3d36b2f0842ed7f25c33cd3a2ccdb3d0df9db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jan 2009 17:10:12 +0100 Subject: [PATCH 01/22] Revert "generic, x86: fix __per_cpu_load relocation" This reverts commit 5a611268b69f05262936dd177205acbce4471358. It is causing occasional boot crashes, caused by certain linker versions (GNU ld version 2.18.50.0.6-2 20080403) messing up: 82dcc000 D __per_cpu_load c16e6000 A __per_cpu_load_abs The __per_cpu_load value is out of whack. Hpa noticed the following detail: * (gdb) p/x -(0xc16e6000-0x82dcc000) * $2 = 0xc16e6000 * I.e. one is the other << 1 The two symbols should be equal. Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f3180a85c66..53e21f36a80 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -451,18 +451,17 @@ * end offset. */ #define PERCPU_VADDR(vaddr, phdr) \ - VMLINUX_SYMBOL(__per_cpu_load_abs) = .; \ - .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load_abs) \ + VMLINUX_SYMBOL(__per_cpu_load) = .; \ + .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \ - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__per_cpu_start) = .; \ - VMLINUX_SYMBOL(__per_cpu_load) = LOADADDR(.data.percpu) + LOAD_OFFSET;\ *(.data.percpu.first) \ *(.data.percpu.page_aligned) \ *(.data.percpu) \ *(.data.percpu.shared_aligned) \ VMLINUX_SYMBOL(__per_cpu_end) = .; \ } phdr \ - . = VMLINUX_SYMBOL(__per_cpu_load_abs) + SIZEOF(.data.percpu); + . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu); /** * PERCPU - define output section for percpu area, simple version From 3ac6cffea4aa18007a454a7442da2855882f403d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Jan 2009 16:32:22 +0900 Subject: [PATCH 02/22] linker script: use separate simpler definition for PERCPU() Impact: fix linker screwup on x86_32 Recent x86_64 zerobased patches introduced PERCPU_VADDR() to put .data.percpu to a predefined address and re-defined PERCPU() in terms of it. The new macro defined one extra symbol, __per_cpu_load, for LMA of the section so that the init data could be accessed. This new symbol introduced the following problems to x86_32. 1. If __per_cpu_load is defined outside of .data.percpu as an absolute symbol, relocation generation for relocatable kernel fails due to absolute relocation. 2. If __per_cpu_load is put inside .data.percpu with absolute address assignment to work around #1, linker gets confused and under certain configurations ends up relocating the symbol against .data.percpu such that the load address gets added on top of already set load address. As x86_32 doesn't use predefined address for .data.percpu, there's no need for it to care about the possibility of __per_cpu_load being different from __per_cpu_start. This patch defines PERCPU() separately so that __per_cpu_load is defined inside .data.percpu so that everything is ordinary linking-wise. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 53e21f36a80..5406e70aba8 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -445,10 +445,9 @@ * section in the linker script will go there too. @phdr should have * a leading colon. * - * This macro defines three symbols, __per_cpu_load, __per_cpu_start - * and __per_cpu_end. The first one is the vaddr of loaded percpu - * init data. __per_cpu_start equals @vaddr and __per_cpu_end is the - * end offset. + * Note that this macros defines __per_cpu_load as an absolute symbol. + * If there is no need to put the percpu section at a predetermined + * address, use PERCPU(). */ #define PERCPU_VADDR(vaddr, phdr) \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ @@ -470,7 +469,20 @@ * Align to @align and outputs output section for percpu area. This * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and * __per_cpu_start will be identical. + * + * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except + * that __per_cpu_load is defined as a relative symbol against + * .data.percpu which is required for relocatable x86_32 + * configuration. */ #define PERCPU(align) \ . = ALIGN(align); \ - PERCPU_VADDR( , ) + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__per_cpu_load) = .; \ + VMLINUX_SYMBOL(__per_cpu_start) = .; \ + *(.data.percpu.first) \ + *(.data.percpu.page_aligned) \ + *(.data.percpu) \ + *(.data.percpu.shared_aligned) \ + VMLINUX_SYMBOL(__per_cpu_end) = .; \ + } From 319f3ba52c71630865b10ac3b99dd020440d681d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 Jan 2009 14:35:01 -0800 Subject: [PATCH 03/22] xen: move remaining mmu-related stuff into mmu.c Impact: Cleanup Move remaining mmu-related stuff into mmu.c. A general cleanup, and lay the groundwork for later patches. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/enlighten.c | 731 +-------------------------------------- arch/x86/xen/mmu.c | 700 +++++++++++++++++++++++++++++++++++++ arch/x86/xen/mmu.h | 3 + arch/x86/xen/xen-ops.h | 8 + 4 files changed, 712 insertions(+), 730 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 6b3f7eef57e..0cd2a165f17 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -61,35 +61,6 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); enum xen_domain_type xen_domain_type = XEN_NATIVE; EXPORT_SYMBOL_GPL(xen_domain_type); -/* - * Identity map, in addition to plain kernel map. This needs to be - * large enough to allocate page table pages to allocate the rest. - * Each page can map 2MB. - */ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; - -#ifdef CONFIG_X86_64 -/* l3 pud for userspace vsyscall mapping */ -static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; -#endif /* CONFIG_X86_64 */ - -/* - * Note about cr3 (pagetable base) values: - * - * xen_cr3 contains the current logical cr3 value; it contains the - * last set cr3. This may not be the current effective cr3, because - * its update may be being lazily deferred. However, a vcpu looking - * at its own cr3 can use this value knowing that it everything will - * be self-consistent. - * - * xen_current_cr3 contains the actual vcpu cr3; it is set once the - * hypercall to set the vcpu cr3 is complete (so it may be a little - * out of date, but it will never be set early). If one vcpu is - * looking at another vcpu's cr3 value, it should use this variable. - */ -DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ -DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ - struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); @@ -237,7 +208,7 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static void xen_leave_lazy(void) +void xen_leave_lazy(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); @@ -598,76 +569,6 @@ static struct apic_ops xen_basic_apic_ops = { #endif -static void xen_flush_tlb(void) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_TLB_FLUSH_LOCAL; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_single(unsigned long addr) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - op = mcs.args; - op->cmd = MMUEXT_INVLPG_LOCAL; - op->arg1.linear_addr = addr & PAGE_MASK; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long va) -{ - struct { - struct mmuext_op op; - DECLARE_BITMAP(mask, NR_CPUS); - } *args; - struct multicall_space mcs; - - BUG_ON(cpumask_empty(cpus)); - BUG_ON(!mm); - - mcs = xen_mc_entry(sizeof(*args)); - args = mcs.args; - args->op.arg2.vcpumask = to_cpumask(args->mask); - - /* Remove us, and any offline CPUS. */ - cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); - if (unlikely(cpumask_empty(to_cpumask(args->mask)))) - goto issue; - - if (va == TLB_FLUSH_ALL) { - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - } else { - args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = va; - } - - MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); - -issue: - xen_mc_issue(PARAVIRT_LAZY_MMU); -} static void xen_clts(void) { @@ -693,21 +594,6 @@ static void xen_write_cr0(unsigned long cr0) xen_mc_issue(PARAVIRT_LAZY_CPU); } -static void xen_write_cr2(unsigned long cr2) -{ - percpu_read(xen_vcpu)->arch.cr2 = cr2; -} - -static unsigned long xen_read_cr2(void) -{ - return percpu_read(xen_vcpu)->arch.cr2; -} - -static unsigned long xen_read_cr2_direct(void) -{ - return percpu_read(xen_vcpu_info.arch.cr2); -} - static void xen_write_cr4(unsigned long cr4) { cr4 &= ~X86_CR4_PGE; @@ -716,71 +602,6 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4); } -static unsigned long xen_read_cr3(void) -{ - return percpu_read(xen_cr3); -} - -static void set_current_cr3(void *v) -{ - percpu_write(xen_current_cr3, (unsigned long)v); -} - -static void __xen_write_cr3(bool kernel, unsigned long cr3) -{ - struct mmuext_op *op; - struct multicall_space mcs; - unsigned long mfn; - - if (cr3) - mfn = pfn_to_mfn(PFN_DOWN(cr3)); - else - mfn = 0; - - WARN_ON(mfn == 0 && kernel); - - mcs = __xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; - op->arg1.mfn = mfn; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - if (kernel) { - percpu_write(xen_cr3, cr3); - - /* Update xen_current_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); - } -} - -static void xen_write_cr3(unsigned long cr3) -{ - BUG_ON(preemptible()); - - xen_mc_batch(); /* disables interrupts */ - - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - percpu_write(xen_cr3, cr3); - - __xen_write_cr3(true, cr3); - -#ifdef CONFIG_X86_64 - { - pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); - if (user_pgd) - __xen_write_cr3(false, __pa(user_pgd)); - else - __xen_write_cr3(false, 0); - } -#endif - - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ -} - static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int ret; @@ -822,185 +643,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } -/* Early in boot, while setting up the initial pagetable, assume - everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) -{ -#ifdef CONFIG_FLATMEM - BUG_ON(mem_map); /* should only be used early */ -#endif - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); -} - -/* Early release_pte assumes that all pts are pinned, since there's - only init_mm and anything attached to that is pinned. */ -static void xen_release_pte_init(unsigned long pfn) -{ - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); -} - -static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - -/* This needs to make sure the new pte page is pinned iff its being - attached to a pinned pagetable. */ -static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) -{ - struct page *page = pfn_to_page(pfn); - - if (PagePinned(virt_to_page(mm->pgd))) { - SetPagePinned(page); - - vm_unmap_aliases(); - if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); - if (level == PT_PTE && USE_SPLIT_PTLOCKS) - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - } else { - /* make sure there are no stray mappings of - this page */ - kmap_flush_unused(); - } - } -} - -static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PTE); -} - -static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PMD); -} - -static int xen_pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = mm->pgd; - int ret = 0; - - BUG_ON(PagePinned(virt_to_page(pgd))); - -#ifdef CONFIG_X86_64 - { - struct page *page = virt_to_page(pgd); - pgd_t *user_pgd; - - BUG_ON(page->private != 0); - - ret = -ENOMEM; - - user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - page->private = (unsigned long)user_pgd; - - if (user_pgd != NULL) { - user_pgd[pgd_index(VSYSCALL_START)] = - __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); - ret = 0; - } - - BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); - } -#endif - - return ret; -} - -static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *user_pgd = xen_get_user_pgd(pgd); - - if (user_pgd) - free_page((unsigned long)user_pgd); -#endif -} - -/* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(unsigned long pfn, unsigned level) -{ - struct page *page = pfn_to_page(pfn); - - if (PagePinned(page)) { - if (!PageHighMem(page)) { - if (level == PT_PTE && USE_SPLIT_PTLOCKS) - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); - } - ClearPagePinned(page); - } -} - -static void xen_release_pte(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PTE); -} - -static void xen_release_pmd(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PMD); -} - -#if PAGETABLE_LEVELS == 4 -static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PUD); -} - -static void xen_release_pud(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PUD); -} -#endif - -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) -{ - pgprot_t prot = PAGE_KERNEL; - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); - - return kmap_atomic_prot(page, type, prot); -} -#endif - -#ifdef CONFIG_X86_32 -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) -{ - /* If there's an existing pte, then don't allow _PAGE_RW to be set */ - if (pte_val_ma(*ptep) & _PAGE_PRESENT) - pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & - pte_val_ma(pte)); - - return pte; -} - -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ -static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) -{ - pte = mask_rw_pte(ptep, pte); - - xen_set_pte(ptep, pte); -} -#endif - -static __init void xen_pagetable_setup_start(pgd_t *base) -{ -} - void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1021,37 +663,6 @@ void xen_setup_shared_info(void) xen_setup_mfn_list_list(); } -static __init void xen_pagetable_setup_done(pgd_t *base) -{ - xen_setup_shared_info(); -} - -static __init void xen_post_allocator_init(void) -{ - pv_mmu_ops.set_pte = xen_set_pte; - pv_mmu_ops.set_pmd = xen_set_pmd; - pv_mmu_ops.set_pud = xen_set_pud; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = xen_set_pgd; -#endif - - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.alloc_pud = xen_alloc_pud; - pv_mmu_ops.release_pud = xen_release_pud; -#endif - -#ifdef CONFIG_X86_64 - SetPagePinned(virt_to_page(level3_user_vsyscall)); -#endif - xen_mark_init_mm_pinned(); -} - /* This is called once we have the cpu_possible_map */ void xen_setup_vcpu_info_placement(void) { @@ -1126,49 +737,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } -static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) -{ - pte_t pte; - - phys >>= PAGE_SHIFT; - - switch (idx) { - case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: -#ifdef CONFIG_X86_F00F_BUG - case FIX_F00F_IDT: -#endif -#ifdef CONFIG_X86_32 - case FIX_WP_TEST: - case FIX_VDSO: -# ifdef CONFIG_HIGHMEM - case FIX_KMAP_BEGIN ... FIX_KMAP_END: -# endif -#else - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: -#endif -#ifdef CONFIG_X86_LOCAL_APIC - case FIX_APIC_BASE: /* maps dummy local APIC */ -#endif - pte = pfn_pte(phys, prot); - break; - - default: - pte = mfn_pte(phys, prot); - break; - } - - __native_set_fixmap(idx, pte); - -#ifdef CONFIG_X86_64 - /* Replicate changes to map the vsyscall page into the user - pagetable vsyscall mapping. */ - if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { - unsigned long vaddr = __fix_to_virt(idx); - set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); - } -#endif -} - static const struct pv_info xen_info __initdata = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, @@ -1264,86 +832,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = { #endif }; -static const struct pv_mmu_ops xen_mmu_ops __initdata = { - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3, - - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_single = xen_flush_tlb_single, - .flush_tlb_others = xen_flush_tlb_others, - - .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, - - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, - - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pte_init, - .alloc_pmd_clone = paravirt_nop, - .release_pmd = xen_release_pte_init, - -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - -#ifdef CONFIG_X86_64 - .set_pte = xen_set_pte, -#else - .set_pte = xen_set_pte_init, -#endif - .set_pte_at = xen_set_pte_at, - .set_pmd = xen_set_pmd_hyper, - - .ptep_modify_prot_start = __ptep_modify_prot_start, - .ptep_modify_prot_commit = __ptep_modify_prot_commit, - - .pte_val = xen_pte_val, - .pgd_val = xen_pgd_val, - - .make_pte = xen_make_pte, - .make_pgd = xen_make_pgd, - -#ifdef CONFIG_X86_PAE - .set_pte_atomic = xen_set_pte_atomic, - .set_pte_present = xen_set_pte_at, - .pte_clear = xen_pte_clear, - .pmd_clear = xen_pmd_clear, -#endif /* CONFIG_X86_PAE */ - .set_pud = xen_set_pud_hyper, - - .make_pmd = xen_make_pmd, - .pmd_val = xen_pmd_val, - -#if PAGETABLE_LEVELS == 4 - .pud_val = xen_pud_val, - .make_pud = xen_make_pud, - .set_pgd = xen_set_pgd_hyper, - - .alloc_pud = xen_alloc_pte_init, - .release_pud = xen_release_pte_init, -#endif /* PAGETABLE_LEVELS == 4 */ - - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, - - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy, - }, - - .set_fixmap = xen_set_fixmap, -}; - static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; @@ -1386,223 +874,6 @@ static const struct machine_ops __initdata xen_machine_ops = { }; -static void __init xen_reserve_top(void) -{ -#ifdef CONFIG_X86_32 - unsigned long top = HYPERVISOR_VIRT_START; - struct xen_platform_parameters pp; - - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) - top = pp.virt_start; - - reserve_top_address(-top); -#endif /* CONFIG_X86_32 */ -} - -/* - * Like __va(), but returns address in the kernel mapping (which is - * all we have until the physical memory mapping has been set up. - */ -static void *__ka(phys_addr_t paddr) -{ -#ifdef CONFIG_X86_64 - return (void *)(paddr + __START_KERNEL_map); -#else - return __va(paddr); -#endif -} - -/* Convert a machine address to physical address */ -static unsigned long m2p(phys_addr_t maddr) -{ - phys_addr_t paddr; - - maddr &= PTE_PFN_MASK; - paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; - - return paddr; -} - -/* Convert a machine address to kernel virtual */ -static void *m2v(phys_addr_t maddr) -{ - return __ka(m2p(maddr)); -} - -static void set_page_prot(void *addr, pgprot_t prot) -{ - unsigned long pfn = __pa(addr) >> PAGE_SHIFT; - pte_t pte = pfn_pte(pfn, prot); - - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) - BUG(); -} - -static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) -{ - unsigned pmdidx, pteidx; - unsigned ident_pte; - unsigned long pfn; - - ident_pte = 0; - pfn = 0; - for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { - pte_t *pte_page; - - /* Reuse or allocate a page of ptes */ - if (pmd_present(pmd[pmdidx])) - pte_page = m2v(pmd[pmdidx].pmd); - else { - /* Check for free pte pages */ - if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) - break; - - pte_page = &level1_ident_pgt[ident_pte]; - ident_pte += PTRS_PER_PTE; - - pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); - } - - /* Install mappings */ - for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { - pte_t pte; - - if (pfn > max_pfn_mapped) - max_pfn_mapped = pfn; - - if (!pte_none(pte_page[pteidx])) - continue; - - pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); - pte_page[pteidx] = pte; - } - } - - for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) - set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); - - set_page_prot(pmd, PAGE_KERNEL_RO); -} - -#ifdef CONFIG_X86_64 -static void convert_pfn_mfn(void *v) -{ - pte_t *pte = v; - int i; - - /* All levels are converted the same way, so just treat them - as ptes. */ - for (i = 0; i < PTRS_PER_PTE; i++) - pte[i] = xen_make_pte(pte[i].pte); -} - -/* - * Set up the inital kernel pagetable. - * - * We can construct this by grafting the Xen provided pagetable into - * head_64.S's preconstructed pagetables. We copy the Xen L2's into - * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This - * means that only the kernel has a physical mapping to start with - - * but that's enough to get __va working. We need to fill in the rest - * of the physical mapping once some sort of allocator has been set - * up. - */ -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) -{ - pud_t *l3; - pmd_t *l2; - - /* Zap identity mapping */ - init_level4_pgt[0] = __pgd(0); - - /* Pre-constructed entries are in pfn, so convert to mfn */ - convert_pfn_mfn(init_level4_pgt); - convert_pfn_mfn(level3_ident_pgt); - convert_pfn_mfn(level3_kernel_pgt); - - l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); - - memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - - l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); - memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - - /* Set up identity map */ - xen_map_identity_early(level2_ident_pgt, max_pfn); - - /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - - /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); - - /* Unpin Xen-provided one */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - /* Switch over */ - pgd = init_level4_pgt; - - /* - * At this stage there can be no user pgd, and no page - * structure to attach it to, so make sure we just set kernel - * pgd. - */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(pgd)); - xen_mc_issue(PARAVIRT_LAZY_CPU); - - reserve_early(__pa(xen_start_info->pt_base), - __pa(xen_start_info->pt_base + - xen_start_info->nr_pt_frames * PAGE_SIZE), - "XEN PAGETABLES"); - - return pgd; -} -#else /* !CONFIG_X86_64 */ -static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; - -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) -{ - pmd_t *kernel_pmd; - - init_pg_tables_start = __pa(pgd); - init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; - max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); - - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); - - xen_map_identity_early(level2_kernel_pgt, max_pfn); - - memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); - set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], - __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); - - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); - set_page_prot(empty_zero_page, PAGE_KERNEL_RO); - - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - xen_write_cr3(__pa(swapper_pg_dir)); - - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); - - return swapper_pg_dir; -} -#endif /* CONFIG_X86_64 */ - /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 98cb9869eb2..94e452c0b00 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,8 @@ #include #include +#include +#include #include "multicalls.h" #include "mmu.h" @@ -114,6 +117,37 @@ static inline void check_zero(void) #endif /* CONFIG_XEN_DEBUG_FS */ + +/* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; + +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + +/* + * Note about cr3 (pagetable base) values: + * + * xen_cr3 contains the current logical cr3 value; it contains the + * last set cr3. This may not be the current effective cr3, because + * its update may be being lazily deferred. However, a vcpu looking + * at its own cr3 can use this value knowing that it everything will + * be self-consistent. + * + * xen_current_cr3 contains the actual vcpu cr3; it is set once the + * hypercall to set the vcpu cr3 is complete (so it may be a little + * out of date, but it will never be set early). If one vcpu is + * looking at another vcpu's cr3 value, it should use this variable. + */ +DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ + + /* * Just beyond the highest usermode address. STACK_TOP_MAX has a * redzone above it, so round it up to a PGD boundary. @@ -1152,6 +1186,672 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } +static __init void xen_pagetable_setup_start(pgd_t *base) +{ +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + xen_setup_shared_info(); +} + +static void xen_write_cr2(unsigned long cr2) +{ + percpu_read(xen_vcpu)->arch.cr2 = cr2; +} + +static unsigned long xen_read_cr2(void) +{ + return percpu_read(xen_vcpu)->arch.cr2; +} + +unsigned long xen_read_cr2_direct(void) +{ + return percpu_read(xen_vcpu_info.arch.cr2); +} + +static void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_others(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long va) +{ + struct { + struct mmuext_op op; + DECLARE_BITMAP(mask, NR_CPUS); + } *args; + struct multicall_space mcs; + + BUG_ON(cpumask_empty(cpus)); + BUG_ON(!mm); + + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->op.arg2.vcpumask = to_cpumask(args->mask); + + /* Remove us, and any offline CPUS. */ + cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); + if (unlikely(cpumask_empty(to_cpumask(args->mask)))) + goto issue; + + if (va == TLB_FLUSH_ALL) { + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + } else { + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = va; + } + + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + +issue: + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static unsigned long xen_read_cr3(void) +{ + return percpu_read(xen_cr3); +} + +static void set_current_cr3(void *v) +{ + percpu_write(xen_current_cr3, (unsigned long)v); +} + +static void __xen_write_cr3(bool kernel, unsigned long cr3) +{ + struct mmuext_op *op; + struct multicall_space mcs; + unsigned long mfn; + + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; + + WARN_ON(mfn == 0 && kernel); + + mcs = __xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; + op->arg1.mfn = mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (kernel) { + percpu_write(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + percpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +} + +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; + + BUG_ON(page->private != 0); + + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { + user_pgd[pgd_index(VSYSCALL_START)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); + ret = 0; + } + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + + +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. */ +static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +/* Early release_pte assumes that all pts are pinned, since there's + only init_mm and anything attached to that is pinned. */ +static void xen_release_pte_init(unsigned long pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(virt_to_page(mm->pgd))) { + SetPagePinned(page); + + vm_unmap_aliases(); + if (!PageHighMem(page)) { + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); + if (level == PT_PTE && USE_SPLIT_PTLOCKS) + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + } else { + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } + } +} + +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PTE); +} + +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PMD); +} + +/* This should never happen until we're OK to use struct page */ +static void xen_release_ptpage(unsigned long pfn, unsigned level) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(page)) { + if (!PageHighMem(page)) { + if (level == PT_PTE && USE_SPLIT_PTLOCKS) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } + ClearPagePinned(page); + } +} + +static void xen_release_pte(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PTE); +} + +static void xen_release_pmd(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PMD); +} + +#if PAGETABLE_LEVELS == 4 +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + +void __init xen_reserve_top(void) +{ +#ifdef CONFIG_X86_32 + unsigned long top = HYPERVISOR_VIRT_START; + struct xen_platform_parameters pp; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + top = pp.virt_start; + + reserve_top_address(-top); +#endif /* CONFIG_X86_32 */ +} + +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. + */ +static void *__ka(phys_addr_t paddr) +{ +#ifdef CONFIG_X86_64 + return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif +} + +/* Convert a machine address to physical address */ +static unsigned long m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_PFN_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; +} + +/* Convert a machine address to kernel virtual */ +static void *m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +static void set_page_prot(void *addr, pgprot_t prot) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) + BUG(); +} + +static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + ident_pte = 0; + pfn = 0; + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + /* Reuse or allocate a page of ptes */ + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + } + + /* Install mappings */ + for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} + +#ifdef CONFIG_X86_64 +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for (i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} + +/* + * Set up the inital kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This + * means that only the kernel has a physical mapping to start with - + * but that's enough to get __va working. We need to fill in the rest + * of the physical mapping once some sort of allocator has been set + * up. + */ +__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) +{ + pud_t *l3; + pmd_t *l2; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + /* Pre-constructed entries are in pfn, so convert to mfn */ + convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(level3_ident_pgt); + convert_pfn_mfn(level3_kernel_pgt); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); + memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + /* Set up identity map */ + xen_map_identity_early(level2_ident_pgt, max_pfn); + + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* Switch over */ + pgd = init_level4_pgt; + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(pgd)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + + reserve_early(__pa(xen_start_info->pt_base), + __pa(xen_start_info->pt_base + + xen_start_info->nr_pt_frames * PAGE_SIZE), + "XEN PAGETABLES"); + + return pgd; +} +#else /* !CONFIG_X86_64 */ +static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; + +__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) +{ + pmd_t *kernel_pmd; + + init_pg_tables_start = __pa(pgd); + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); + + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + + xen_map_identity_early(level2_kernel_pgt, max_pfn); + + memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], + __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + xen_write_cr3(__pa(swapper_pg_dir)); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + + return swapper_pg_dir; +} +#endif /* CONFIG_X86_64 */ + +static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) +{ + pte_t pte; + + phys >>= PAGE_SHIFT; + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: +#ifdef CONFIG_X86_F00F_BUG + case FIX_F00F_IDT: +#endif +#ifdef CONFIG_X86_32 + case FIX_WP_TEST: + case FIX_VDSO: +# ifdef CONFIG_HIGHMEM + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif +#else + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: +#endif +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ +#endif + pte = pfn_pte(phys, prot); + break; + + default: + pte = mfn_pte(phys, prot); + break; + } + + __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_64 + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. */ + if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif +} + +__init void xen_post_allocator_init(void) +{ + pv_mmu_ops.set_pte = xen_set_pte; + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.set_pgd = xen_set_pgd; +#endif + + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif + xen_mark_init_mm_pinned(); +} + + +const struct pv_mmu_ops xen_mmu_ops __initdata = { + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, + + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pte_init, + .alloc_pmd_clone = paravirt_nop, + .release_pmd = xen_release_pte_init, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = xen_kmap_atomic_pte, +#endif + +#ifdef CONFIG_X86_64 + .set_pte = xen_set_pte, +#else + .set_pte = xen_set_pte_init, +#endif + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd_hyper, + + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + + .pte_val = xen_pte_val, + .pgd_val = xen_pgd_val, + + .make_pte = xen_make_pte, + .make_pgd = xen_make_pgd, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .set_pte_present = xen_set_pte_at, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, + + .make_pmd = xen_make_pmd, + .pmd_val = xen_pmd_val, + +#if PAGETABLE_LEVELS == 4 + .pud_val = xen_pud_val, + .make_pud = xen_make_pud, + .set_pgd = xen_set_pgd_hyper, + + .alloc_pud = xen_alloc_pte_init, + .release_pud = xen_release_pte_init, +#endif /* PAGETABLE_LEVELS == 4 */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy, + }, + + .set_fixmap = xen_set_fixmap, +}; + + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 98d71659da5..24d1b44a337 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +unsigned long xen_read_cr2_direct(void); + +extern const struct pv_mmu_ops xen_mmu_ops; #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index c1f8faf0a2c..11913fc94c1 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -13,6 +13,7 @@ extern const char xen_failsafe_callback[]; struct trap_info; void xen_copy_trap_info(struct trap_info *traps); +DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); @@ -22,6 +23,13 @@ extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); +void xen_setup_machphys_mapping(void); +pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); +void xen_ident_map_ISA(void); +void xen_reserve_top(void); + +void xen_leave_lazy(void); +void xen_post_allocator_init(void); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); From 41edafdb78feac1d1f8823846209975fde990633 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 Jan 2009 14:35:02 -0800 Subject: [PATCH 04/22] x86/pvops: add a paravirt_ident functions to allow special patching Impact: Optimization Several paravirt ops implementations simply return their arguments, the most obvious being the make_pte/pte_val class of operations on native. On 32-bit, the identity function is literally a no-op, as the calling convention uses the same registers for the first argument and return. On 64-bit, it can be implemented with a single "mov". This patch adds special identity functions for 32 and 64 bit argument, and machinery to recognize them and replace them with either nops or a mov as appropriate. At the moment, the only users for the identity functions are the pagetable entry conversion functions. The result is a measureable improvement on pagetable-heavy benchmarks (2-3%, reducing the pvops overhead from 5 to 2%). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/paravirt.h | 5 ++ arch/x86/kernel/paravirt.c | 75 +++++++++++++++++++++++++---- arch/x86/kernel/paravirt_patch_32.c | 12 +++++ arch/x86/kernel/paravirt_patch_64.c | 15 ++++++ 4 files changed, 98 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 17577888709..961d10c12f1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -388,6 +388,8 @@ extern struct pv_lock_ops pv_lock_ops; asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") unsigned paravirt_patch_nop(void); +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); unsigned paravirt_patch_ignore(unsigned len); unsigned paravirt_patch_call(void *insnbuf, const void *target, u16 tgt_clobbers, @@ -1371,6 +1373,9 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, } void _paravirt_nop(void); +u32 _paravirt_ident_32(u32); +u64 _paravirt_ident_64(u64); + #define paravirt_nop ((void *)_paravirt_nop) void paravirt_use_bytelocks(void); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 202514be592..dd25e2b1593 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -44,6 +44,17 @@ void _paravirt_nop(void) { } +/* identity function, which can be inlined */ +u32 _paravirt_ident_32(u32 x) +{ + return x; +} + +u64 _paravirt_ident_64(u64 x) +{ + return x; +} + static void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", @@ -138,9 +149,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, if (opfunc == NULL) /* If there's no function, patch it with a ud2a (BUG) */ ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); - else if (opfunc == paravirt_nop) + else if (opfunc == _paravirt_nop) /* If the operation is a nop, then nop the callsite */ ret = paravirt_patch_nop(); + + /* identity functions just return their single argument */ + else if (opfunc == _paravirt_ident_32) + ret = paravirt_patch_ident_32(insnbuf, len); + else if (opfunc == _paravirt_ident_64) + ret = paravirt_patch_ident_64(insnbuf, len); + else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || @@ -373,6 +391,45 @@ struct pv_apic_ops pv_apic_ops = { #endif }; +typedef pte_t make_pte_t(pteval_t); +typedef pmd_t make_pmd_t(pmdval_t); +typedef pud_t make_pud_t(pudval_t); +typedef pgd_t make_pgd_t(pgdval_t); + +typedef pteval_t pte_val_t(pte_t); +typedef pmdval_t pmd_val_t(pmd_t); +typedef pudval_t pud_val_t(pud_t); +typedef pgdval_t pgd_val_t(pgd_t); + + +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) +/* 32-bit pagetable entries */ +#define paravirt_native_make_pte (make_pte_t *)_paravirt_ident_32 +#define paravirt_native_pte_val (pte_val_t *)_paravirt_ident_32 + +#define paravirt_native_make_pmd (make_pmd_t *)_paravirt_ident_32 +#define paravirt_native_pmd_val (pmd_val_t *)_paravirt_ident_32 + +#define paravirt_native_make_pud (make_pud_t *)_paravirt_ident_32 +#define paravirt_native_pud_val (pud_val_t *)_paravirt_ident_32 + +#define paravirt_native_make_pgd (make_pgd_t *)_paravirt_ident_32 +#define paravirt_native_pgd_val (pgd_val_t *)_paravirt_ident_32 +#else +/* 64-bit pagetable entries */ +#define paravirt_native_make_pte (make_pte_t *)_paravirt_ident_64 +#define paravirt_native_pte_val (pte_val_t *)_paravirt_ident_64 + +#define paravirt_native_make_pmd (make_pmd_t *)_paravirt_ident_64 +#define paravirt_native_pmd_val (pmd_val_t *)_paravirt_ident_64 + +#define paravirt_native_make_pud (make_pud_t *)_paravirt_ident_64 +#define paravirt_native_pud_val (pud_val_t *)_paravirt_ident_64 + +#define paravirt_native_make_pgd (make_pgd_t *)_paravirt_ident_64 +#define paravirt_native_pgd_val (pgd_val_t *)_paravirt_ident_64 +#endif + struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, @@ -424,21 +481,21 @@ struct pv_mmu_ops pv_mmu_ops = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, - .pmd_val = native_pmd_val, - .make_pmd = native_make_pmd, + .pmd_val = paravirt_native_pmd_val, + .make_pmd = paravirt_native_make_pmd, #if PAGETABLE_LEVELS == 4 - .pud_val = native_pud_val, - .make_pud = native_make_pud, + .pud_val = paravirt_native_pud_val, + .make_pud = paravirt_native_make_pud, .set_pgd = native_set_pgd, #endif #endif /* PAGETABLE_LEVELS >= 3 */ - .pte_val = native_pte_val, - .pgd_val = native_pgd_val, + .pte_val = paravirt_native_pte_val, + .pgd_val = paravirt_native_pgd_val, - .make_pte = native_make_pte, - .make_pgd = native_make_pgd, + .make_pte = paravirt_native_make_pte, + .make_pgd = paravirt_native_make_pgd, .dup_mmap = paravirt_nop, .exit_mmap = paravirt_nop, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 9fe644f4861..d9f32e6d6ab 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) +{ + /* arg in %eax, return in %eax */ + return 0; +} + +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) +{ + /* arg in %edx:%eax, return in %edx:%eax */ + return 0; +} + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 061d01df9ae..3f08f34f93e 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); +DEF_NATIVE(, mov32, "mov %edi, %eax"); +DEF_NATIVE(, mov64, "mov %rdi, %rax"); + +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) +{ + return paravirt_patch_insns(insnbuf, len, + start__mov32, end__mov32); +} + +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) +{ + return paravirt_patch_insns(insnbuf, len, + start__mov64, end__mov64); +} + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { From b8aa287f77be943e37a84fa4657e27df95269bfb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 Jan 2009 14:35:03 -0800 Subject: [PATCH 05/22] x86: fix paravirt clobber in entry_64.S Impact: Fix latent bug The clobber is trying to say that anything except RDI is available for clobbering, but actually clobbers everything. This hasn't mattered because the clobbers were basically ignored, but subsequent patches will rely on them. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a52703864a1..e4c9710cae5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1140,7 +1140,7 @@ ENTRY(native_load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 - DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) + DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: movl %edi,%gs From 9104a18dcdd8dfefdddca8ce44988563f13ed3c4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 Jan 2009 14:35:04 -0800 Subject: [PATCH 06/22] x86/paravirt: selectively save/restore regs around pvops calls Impact: Optimization Each asm paravirt-ops call says what registers are available for clobbering. This patch makes use of this to selectively save/restore registers around each pvops call. In many cases this significantly shrinks code size. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/paravirt.h | 100 +++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 961d10c12f1..dcce961262b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -12,19 +12,29 @@ #define CLBR_EAX (1 << 0) #define CLBR_ECX (1 << 1) #define CLBR_EDX (1 << 2) +#define CLBR_EDI (1 << 3) -#ifdef CONFIG_X86_64 -#define CLBR_RSI (1 << 3) -#define CLBR_RDI (1 << 4) +#ifdef CONFIG_X86_32 +/* CLBR_ANY should match all regs platform has. For i386, that's just it */ +#define CLBR_ANY ((1 << 4) - 1) +#else +#define CLBR_RAX CLBR_EAX +#define CLBR_RCX CLBR_ECX +#define CLBR_RDX CLBR_EDX +#define CLBR_RDI CLBR_EDI +#define CLBR_RSI (1 << 4) #define CLBR_R8 (1 << 5) #define CLBR_R9 (1 << 6) #define CLBR_R10 (1 << 7) #define CLBR_R11 (1 << 8) #define CLBR_ANY ((1 << 9) - 1) + +#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ + CLBR_RCX | CLBR_R8 | CLBR_R9) +#define CLBR_RET_REG (CLBR_RAX | CLBR_RDX) +#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) + #include -#else -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 3) - 1) #endif /* X86_64 */ #ifndef __ASSEMBLY__ @@ -1530,33 +1540,49 @@ static inline unsigned long __raw_local_irq_save(void) .popsection +#define COND_PUSH(set, mask, reg) \ + .if ((~set) & mask); push %reg; .endif +#define COND_POP(set, mask, reg) \ + .if ((~set) & mask); pop %reg; .endif + #ifdef CONFIG_X86_64 -#define PV_SAVE_REGS \ - push %rax; \ - push %rcx; \ - push %rdx; \ - push %rsi; \ - push %rdi; \ - push %r8; \ - push %r9; \ - push %r10; \ - push %r11 -#define PV_RESTORE_REGS \ - pop %r11; \ - pop %r10; \ - pop %r9; \ - pop %r8; \ - pop %rdi; \ - pop %rsi; \ - pop %rdx; \ - pop %rcx; \ - pop %rax + +#define PV_SAVE_REGS(set) \ + COND_PUSH(set, CLBR_RAX, rax); \ + COND_PUSH(set, CLBR_RCX, rcx); \ + COND_PUSH(set, CLBR_RDX, rdx); \ + COND_PUSH(set, CLBR_RSI, rsi); \ + COND_PUSH(set, CLBR_RDI, rdi); \ + COND_PUSH(set, CLBR_R8, r8); \ + COND_PUSH(set, CLBR_R9, r9); \ + COND_PUSH(set, CLBR_R10, r10); \ + COND_PUSH(set, CLBR_R11, r11) +#define PV_RESTORE_REGS(set) \ + COND_POP(set, CLBR_R11, r11); \ + COND_POP(set, CLBR_R10, r10); \ + COND_POP(set, CLBR_R9, r9); \ + COND_POP(set, CLBR_R8, r8); \ + COND_POP(set, CLBR_RDI, rdi); \ + COND_POP(set, CLBR_RSI, rsi); \ + COND_POP(set, CLBR_RDX, rdx); \ + COND_POP(set, CLBR_RCX, rcx); \ + COND_POP(set, CLBR_RAX, rax) + #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) #else -#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx -#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax +#define PV_SAVE_REGS(set) \ + COND_PUSH(set, CLBR_EAX, eax); \ + COND_PUSH(set, CLBR_EDI, edi); \ + COND_PUSH(set, CLBR_ECX, ecx); \ + COND_PUSH(set, CLBR_EDX, edx) +#define PV_RESTORE_REGS(set) \ + COND_POP(set, CLBR_EDX, edx); \ + COND_POP(set, CLBR_ECX, ecx); \ + COND_POP(set, CLBR_EDI, edi); \ + COND_POP(set, CLBR_EAX, eax) + #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) #define PARA_INDIRECT(addr) *%cs:addr @@ -1568,15 +1594,15 @@ static inline unsigned long __raw_local_irq_save(void) #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ - PV_SAVE_REGS; \ + PV_SAVE_REGS(clobbers); \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ - PV_RESTORE_REGS;) \ + PV_RESTORE_REGS(clobbers);) #define ENABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ - PV_SAVE_REGS; \ + PV_SAVE_REGS(clobbers); \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ - PV_RESTORE_REGS;) + PV_RESTORE_REGS(clobbers);) #define USERGS_SYSRET32 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ @@ -1606,11 +1632,15 @@ static inline unsigned long __raw_local_irq_save(void) PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ swapgs) +/* + * Note: swapgs is very special, and in practise is either going to be + * implemented with a single "swapgs" instruction or something very + * special. Either way, we don't need to save any registers for + * it. + */ #define SWAPGS \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ - PV_SAVE_REGS; \ - call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \ - PV_RESTORE_REGS \ + call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \ ) #define GET_CR2_INTO_RCX \ From ecb93d1ccd0aac63f03be2db3cac3fa974716f4c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 Jan 2009 14:35:05 -0800 Subject: [PATCH 07/22] x86/paravirt: add register-saving thunks to reduce caller register pressure Impact: Optimization One of the problems with inserting a pile of C calls where previously there were none is that the register pressure is greatly increased. The C calling convention says that the caller must expect a certain set of registers may be trashed by the callee, and that the callee can use those registers without restriction. This includes the function argument registers, and several others. This patch seeks to alleviate this pressure by introducing wrapper thunks that will do the register saving/restoring, so that the callsite doesn't need to worry about it, but the callee function can be conventional compiler-generated code. In many cases (particularly performance-sensitive cases) the callee will be in assembler anyway, and need not use the compiler's calling convention. Standard calling convention is: arguments return scratch x86-32 eax edx ecx eax ? x86-64 rdi rsi rdx rcx rax r8 r9 r10 r11 The thunk preserves all argument and scratch registers. The return register is not preserved, and is available as a scratch register for unwrapped callee code (and of course the return value). Wrapped function pointers are themselves wrapped in a struct paravirt_callee_save structure, in order to get some warning from the compiler when functions with mismatched calling conventions are used. The most common paravirt ops, both statically and dynamically, are interrupt enable/disable/save/restore, so handle them first. This is particularly easy since their calls are handled specially anyway. XXX Deal with VMI. What's their calling convention? Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/paravirt.h | 126 ++++++++++++++++++++++++-------- arch/x86/kernel/paravirt.c | 8 +- arch/x86/kernel/vsmp_64.c | 12 ++- arch/x86/lguest/boot.c | 13 +++- arch/x86/xen/enlighten.c | 8 +- arch/x86/xen/irq.c | 14 +++- 6 files changed, 132 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index dcce961262b..f9107b88631 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -17,6 +17,10 @@ #ifdef CONFIG_X86_32 /* CLBR_ANY should match all regs platform has. For i386, that's just it */ #define CLBR_ANY ((1 << 4) - 1) + +#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) +#define CLBR_RET_REG (CLBR_EAX) +#define CLBR_SCRATCH (0) #else #define CLBR_RAX CLBR_EAX #define CLBR_RCX CLBR_ECX @@ -27,16 +31,19 @@ #define CLBR_R9 (1 << 6) #define CLBR_R10 (1 << 7) #define CLBR_R11 (1 << 8) + #define CLBR_ANY ((1 << 9) - 1) #define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX | CLBR_RDX) +#define CLBR_RET_REG (CLBR_RAX) #define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) #include #endif /* X86_64 */ +#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) + #ifndef __ASSEMBLY__ #include #include @@ -50,6 +57,14 @@ struct tss_struct; struct mm_struct; struct desc_struct; +/* + * Wrapper type for pointers to code which uses the non-standard + * calling convention. See PV_CALL_SAVE_REGS_THUNK below. + */ +struct paravirt_callee_save { + void *func; +}; + /* general info */ struct pv_info { unsigned int kernel_rpl; @@ -199,11 +214,15 @@ struct pv_irq_ops { * expected to use X86_EFLAGS_IF; all other bits * returned from save_fl are undefined, and may be ignored by * restore_fl. + * + * NOTE: These functions callers expect the callee to preserve + * more registers than the standard C calling convention. */ - unsigned long (*save_fl)(void); - void (*restore_fl)(unsigned long); - void (*irq_disable)(void); - void (*irq_enable)(void); + struct paravirt_callee_save save_fl; + struct paravirt_callee_save restore_fl; + struct paravirt_callee_save irq_disable; + struct paravirt_callee_save irq_enable; + void (*safe_halt)(void); void (*halt)(void); @@ -1437,12 +1456,37 @@ extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; #ifdef CONFIG_X86_32 -#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;" -#define PV_RESTORE_REGS "popl %%edx; popl %%ecx" +#define PV_SAVE_REGS "pushl %ecx; pushl %edx;" +#define PV_RESTORE_REGS "popl %edx; popl %ecx;" + +/* save and restore all caller-save registers, except return value */ +#define PV_SAVE_ALL_CALLER_REGS PV_SAVE_REGS +#define PV_RESTORE_ALL_CALLER_REGS PV_RESTORE_REGS + #define PV_FLAGS_ARG "0" #define PV_EXTRA_CLOBBERS #define PV_VEXTRA_CLOBBERS #else +/* save and restore all caller-save registers, except return value */ +#define PV_SAVE_ALL_CALLER_REGS \ + "push %rcx;" \ + "push %rdx;" \ + "push %rsi;" \ + "push %rdi;" \ + "push %r8;" \ + "push %r9;" \ + "push %r10;" \ + "push %r11;" +#define PV_RESTORE_ALL_CALLER_REGS \ + "pop %r11;" \ + "pop %r10;" \ + "pop %r9;" \ + "pop %r8;" \ + "pop %rdi;" \ + "pop %rsi;" \ + "pop %rdx;" \ + "pop %rcx;" + /* We save some registers, but all of them, that's too much. We clobber all * caller saved registers but the argument parameter */ #define PV_SAVE_REGS "pushq %%rdi;" @@ -1452,52 +1496,76 @@ extern struct paravirt_patch_site __parainstructions[], #define PV_FLAGS_ARG "D" #endif +/* + * Generate a thunk around a function which saves all caller-save + * registers except for the return value. This allows C functions to + * be called from assembler code where fewer than normal registers are + * available. It may also help code generation around calls from C + * code if the common case doesn't use many registers. + * + * When a callee is wrapped in a thunk, the caller can assume that all + * arg regs and all scratch registers are preserved across the + * call. The return value in rax/eax will not be saved, even for void + * functions. + */ +#define PV_CALLEE_SAVE_REGS_THUNK(func) \ + extern typeof(func) __raw_callee_save_##func; \ + static void *__##func##__ __used = func; \ + \ + asm(".pushsection .text;" \ + "__raw_callee_save_" #func ": " \ + PV_SAVE_ALL_CALLER_REGS \ + "call " #func ";" \ + PV_RESTORE_ALL_CALLER_REGS \ + "ret;" \ + ".popsection") + +/* Get a reference to a callee-save function */ +#define PV_CALLEE_SAVE(func) \ + ((struct paravirt_callee_save) { __raw_callee_save_##func }) + +/* Promise that "func" already uses the right calling convention */ +#define __PV_IS_CALLEE_SAVE(func) \ + ((struct paravirt_callee_save) { func }) + static inline unsigned long __raw_local_save_flags(void) { unsigned long f; - asm volatile(paravirt_alt(PV_SAVE_REGS - PARAVIRT_CALL - PV_RESTORE_REGS) + asm volatile(paravirt_alt(PARAVIRT_CALL) : "=a"(f) : paravirt_type(pv_irq_ops.save_fl), paravirt_clobber(CLBR_EAX) - : "memory", "cc" PV_VEXTRA_CLOBBERS); + : "memory", "cc"); return f; } static inline void raw_local_irq_restore(unsigned long f) { - asm volatile(paravirt_alt(PV_SAVE_REGS - PARAVIRT_CALL - PV_RESTORE_REGS) + asm volatile(paravirt_alt(PARAVIRT_CALL) : "=a"(f) : PV_FLAGS_ARG(f), paravirt_type(pv_irq_ops.restore_fl), paravirt_clobber(CLBR_EAX) - : "memory", "cc" PV_EXTRA_CLOBBERS); + : "memory", "cc"); } static inline void raw_local_irq_disable(void) { - asm volatile(paravirt_alt(PV_SAVE_REGS - PARAVIRT_CALL - PV_RESTORE_REGS) + asm volatile(paravirt_alt(PARAVIRT_CALL) : : paravirt_type(pv_irq_ops.irq_disable), paravirt_clobber(CLBR_EAX) - : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); + : "memory", "eax", "cc"); } static inline void raw_local_irq_enable(void) { - asm volatile(paravirt_alt(PV_SAVE_REGS - PARAVIRT_CALL - PV_RESTORE_REGS) + asm volatile(paravirt_alt(PARAVIRT_CALL) : : paravirt_type(pv_irq_ops.irq_enable), paravirt_clobber(CLBR_EAX) - : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); + : "memory", "eax", "cc"); } static inline unsigned long __raw_local_irq_save(void) @@ -1541,9 +1609,9 @@ static inline unsigned long __raw_local_irq_save(void) #define COND_PUSH(set, mask, reg) \ - .if ((~set) & mask); push %reg; .endif + .if ((~(set)) & mask); push %reg; .endif #define COND_POP(set, mask, reg) \ - .if ((~set) & mask); pop %reg; .endif + .if ((~(set)) & mask); pop %reg; .endif #ifdef CONFIG_X86_64 @@ -1594,15 +1662,15 @@ static inline unsigned long __raw_local_irq_save(void) #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ - PV_SAVE_REGS(clobbers); \ + PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ - PV_RESTORE_REGS(clobbers);) + PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #define ENABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ - PV_SAVE_REGS(clobbers); \ + PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ - PV_RESTORE_REGS(clobbers);) + PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #define USERGS_SYSRET32 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index dd25e2b1593..8adb6b5aa42 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -310,10 +310,10 @@ struct pv_time_ops pv_time_ops = { struct pv_irq_ops pv_irq_ops = { .init_IRQ = native_init_IRQ, - .save_fl = native_save_fl, - .restore_fl = native_restore_fl, - .irq_disable = native_irq_disable, - .irq_enable = native_irq_enable, + .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), + .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), + .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), + .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .safe_halt = native_safe_halt, .halt = native_halt, #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index a688f3bfaec..c609205df59 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void) flags &= ~X86_EFLAGS_IF; return flags; } +PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl); static void vsmp_restore_fl(unsigned long flags) { @@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags) flags |= X86_EFLAGS_AC; native_restore_fl(flags); } +PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); static void vsmp_irq_disable(void) { @@ -53,6 +55,7 @@ static void vsmp_irq_disable(void) native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); } +PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); static void vsmp_irq_enable(void) { @@ -60,6 +63,7 @@ static void vsmp_irq_enable(void) native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); } +PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable); static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void) cap, ctl); if (cap & ctl & (1 << 4)) { /* Setup irq ops and turn on vSMP IRQ fastpath handling */ - pv_irq_ops.irq_disable = vsmp_irq_disable; - pv_irq_ops.irq_enable = vsmp_irq_enable; - pv_irq_ops.save_fl = vsmp_save_fl; - pv_irq_ops.restore_fl = vsmp_restore_fl; + pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); + pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable); + pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); + pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); pv_init_ops.patch = vsmp_patch; ctl &= ~(1 << 4); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 92f1c6f3e19..19e33b6cd59 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -173,24 +173,29 @@ static unsigned long save_fl(void) { return lguest_data.irq_enabled; } +PV_CALLEE_SAVE_REGS_THUNK(save_fl); /* restore_flags() just sets the flags back to the value given. */ static void restore_fl(unsigned long flags) { lguest_data.irq_enabled = flags; } +PV_CALLEE_SAVE_REGS_THUNK(restore_fl); /* Interrupts go off... */ static void irq_disable(void) { lguest_data.irq_enabled = 0; } +PV_CALLEE_SAVE_REGS_THUNK(irq_disable); /* Interrupts go on... */ static void irq_enable(void) { lguest_data.irq_enabled = X86_EFLAGS_IF; } +PV_CALLEE_SAVE_REGS_THUNK(irq_enable); + /*:*/ /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). This seems to work for the moment, @@ -984,10 +989,10 @@ __init void lguest_init(void) /* interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; - pv_irq_ops.save_fl = save_fl; - pv_irq_ops.restore_fl = restore_fl; - pv_irq_ops.irq_disable = irq_disable; - pv_irq_ops.irq_enable = irq_enable; + pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); + pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); + pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); + pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; /* init-time operations */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0cd2a165f17..ff6d530ccc7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -676,10 +676,10 @@ void xen_setup_vcpu_info_placement(void) if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); - pv_irq_ops.save_fl = xen_save_fl_direct; - pv_irq_ops.restore_fl = xen_restore_fl_direct; - pv_irq_ops.irq_disable = xen_irq_disable_direct; - pv_irq_ops.irq_enable = xen_irq_enable_direct; + pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); + pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); + pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); + pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } } diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 2e8271431e1..5a070900ad3 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -50,6 +50,7 @@ static unsigned long xen_save_fl(void) */ return (-flags) & X86_EFLAGS_IF; } +PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); static void xen_restore_fl(unsigned long flags) { @@ -76,6 +77,7 @@ static void xen_restore_fl(unsigned long flags) xen_force_evtchn_callback(); } } +PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); static void xen_irq_disable(void) { @@ -86,6 +88,7 @@ static void xen_irq_disable(void) percpu_read(xen_vcpu)->evtchn_upcall_mask = 1; preempt_enable_no_resched(); } +PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); static void xen_irq_enable(void) { @@ -106,6 +109,7 @@ static void xen_irq_enable(void) if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); } +PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); static void xen_safe_halt(void) { @@ -124,10 +128,12 @@ static void xen_halt(void) static const struct pv_irq_ops xen_irq_ops __initdata = { .init_IRQ = __xen_init_IRQ, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, + + .save_fl = PV_CALLEE_SAVE(xen_save_fl), + .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), + .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), + .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), + .safe_halt = xen_safe_halt, .halt = xen_halt, #ifdef CONFIG_X86_64 From 791bad9d28d405d9397ea0c370ffb7c7bdd2aa6e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 Jan 2009 14:35:06 -0800 Subject: [PATCH 08/22] x86/paravirt: implement PVOP_CALL macros for callee-save functions Impact: Optimization Functions with the callee save calling convention clobber many fewer registers than the normal C calling convention. Implement variants of PVOP_V?CALL* accordingly. This only bothers with functions up to 3 args, since functions with more args may as well use the normal calling convention. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/paravirt.h | 134 +++++++++++++++++++++++--------- 1 file changed, 99 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f9107b88631..beb10ecdbe6 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -510,25 +510,45 @@ int paravirt_disable_iospace(void); * makes sure the incoming and outgoing types are always correct. */ #ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx +#define PVOP_VCALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx #define PVOP_CALL_ARGS PVOP_VCALL_ARGS + +#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) + #define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ "=c" (__ecx) #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + #define EXTRA_CLOBBERS #define VEXTRA_CLOBBERS -#else -#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx +#else /* CONFIG_X86_64 */ +#define PVOP_VCALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx #define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) + #define PVOP_VCALL_CLOBBERS "=D" (__edi), \ "=S" (__esi), "=d" (__edx), \ "=c" (__ecx) - #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + #define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" #define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" -#endif +#endif /* CONFIG_X86_32 */ #ifdef CONFIG_PARAVIRT_DEBUG #define PVOP_TEST_NULL(op) BUG_ON(op == NULL) @@ -536,10 +556,11 @@ int paravirt_disable_iospace(void); #define PVOP_TEST_NULL(op) ((void)op) #endif -#define __PVOP_CALL(rettype, op, pre, post, ...) \ +#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ + pre, post, ...) \ ({ \ rettype __ret; \ - PVOP_CALL_ARGS; \ + PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ /* This is 32-bit specific, but is okay in 64-bit */ \ /* since this condition will never hold */ \ @@ -547,70 +568,113 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : PVOP_CALL_CLOBBERS \ + : call_clbr \ : paravirt_type(op), \ - paravirt_clobber(CLBR_ANY), \ + paravirt_clobber(clbr), \ ##__VA_ARGS__ \ - : "memory", "cc" EXTRA_CLOBBERS); \ + : "memory", "cc" extra_clbr); \ __ret = (rettype)((((u64)__edx) << 32) | __eax); \ } else { \ asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : PVOP_CALL_CLOBBERS \ + : call_clbr \ : paravirt_type(op), \ - paravirt_clobber(CLBR_ANY), \ + paravirt_clobber(clbr), \ ##__VA_ARGS__ \ - : "memory", "cc" EXTRA_CLOBBERS); \ + : "memory", "cc" extra_clbr); \ __ret = (rettype)__eax; \ } \ __ret; \ }) -#define __PVOP_VCALL(op, pre, post, ...) \ + +#define __PVOP_CALL(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ + EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) + +#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_CALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + +#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ ({ \ PVOP_VCALL_ARGS; \ PVOP_TEST_NULL(op); \ asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : PVOP_VCALL_CLOBBERS \ + : call_clbr \ : paravirt_type(op), \ - paravirt_clobber(CLBR_ANY), \ + paravirt_clobber(clbr), \ ##__VA_ARGS__ \ - : "memory", "cc" VEXTRA_CLOBBERS); \ + : "memory", "cc" extra_clbr); \ }) +#define __PVOP_VCALL(op, pre, post, ...) \ + ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, \ + pre, post, ##__VA_ARGS__) + +#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + + #define PVOP_CALL0(rettype, op) \ __PVOP_CALL(rettype, op, "", "") #define PVOP_VCALL0(op) \ __PVOP_VCALL(op, "", "") +#define PVOP_CALLEE0(rettype, op) \ + __PVOP_CALLEESAVE(rettype, op, "", "") +#define PVOP_VCALLEE0(op) \ + __PVOP_VCALLEESAVE(op, "", "") + + #define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1))) + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) #define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1))) + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) + +#define PVOP_CALLEE1(rettype, op, arg1) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALLEE1(op, arg1) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) + #define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ - "1" ((unsigned long)(arg2))) + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) #define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ - "1" ((unsigned long)(arg2))) + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + +#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALLEE2(op, arg1, arg2) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + #define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ - "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) #define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ - "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) /* This is the only difference in x86_64. We can make it much simpler */ #ifdef CONFIG_X86_32 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ __PVOP_CALL(rettype, op, \ "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ __PVOP_VCALL(op, \ "push %[_arg4];", "lea 4(%%esp),%%esp;", \ @@ -618,13 +682,13 @@ int paravirt_disable_iospace(void); "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) #else #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ - "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ - "3"((unsigned long)(arg4))) + __PVOP_CALL(rettype, op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ - "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ - "3"((unsigned long)(arg4))) + __PVOP_VCALL(op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) #endif static inline int paravirt_enabled(void) From da5de7c22eb705be709a57e486e7475a6969b994 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 Jan 2009 14:35:07 -0800 Subject: [PATCH 09/22] x86/paravirt: use callee-saved convention for pte_val/make_pte/etc Impact: Optimization In the native case, pte_val, make_pte, etc are all just identity functions, so there's no need to clobber a lot of registers over them. (This changes the 32-bit callee-save calling convention to return both EAX and EDX so functions can return 64-bit values.) Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/paravirt.h | 78 ++++++++++++++++----------------- arch/x86/kernel/paravirt.c | 53 +++++----------------- arch/x86/xen/mmu.c | 24 ++++++---- 3 files changed, 67 insertions(+), 88 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index beb10ecdbe6..2d098b78bc1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -19,7 +19,7 @@ #define CLBR_ANY ((1 << 4) - 1) #define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX) +#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) #define CLBR_SCRATCH (0) #else #define CLBR_RAX CLBR_EAX @@ -308,11 +308,11 @@ struct pv_mmu_ops { void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); - pteval_t (*pte_val)(pte_t); - pte_t (*make_pte)(pteval_t pte); + struct paravirt_callee_save pte_val; + struct paravirt_callee_save make_pte; - pgdval_t (*pgd_val)(pgd_t); - pgd_t (*make_pgd)(pgdval_t pgd); + struct paravirt_callee_save pgd_val; + struct paravirt_callee_save make_pgd; #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE @@ -327,12 +327,12 @@ struct pv_mmu_ops { void (*set_pud)(pud_t *pudp, pud_t pudval); - pmdval_t (*pmd_val)(pmd_t); - pmd_t (*make_pmd)(pmdval_t pmd); + struct paravirt_callee_save pmd_val; + struct paravirt_callee_save make_pmd; #if PAGETABLE_LEVELS == 4 - pudval_t (*pud_val)(pud_t); - pud_t (*make_pud)(pudval_t pud); + struct paravirt_callee_save pud_val; + struct paravirt_callee_save make_pud; void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); #endif /* PAGETABLE_LEVELS == 4 */ @@ -1155,13 +1155,13 @@ static inline pte_t __pte(pteval_t val) pteval_t ret; if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALL2(pteval_t, - pv_mmu_ops.make_pte, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pteval_t, + pv_mmu_ops.make_pte, + val, (u64)val >> 32); else - ret = PVOP_CALL1(pteval_t, - pv_mmu_ops.make_pte, - val); + ret = PVOP_CALLEE1(pteval_t, + pv_mmu_ops.make_pte, + val); return (pte_t) { .pte = ret }; } @@ -1171,11 +1171,11 @@ static inline pteval_t pte_val(pte_t pte) pteval_t ret; if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val, - pte.pte, (u64)pte.pte >> 32); + ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val, + pte.pte, (u64)pte.pte >> 32); else - ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val, - pte.pte); + ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val, + pte.pte); return ret; } @@ -1185,11 +1185,11 @@ static inline pgd_t __pgd(pgdval_t val) pgdval_t ret; if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd, + val, (u64)val >> 32); else - ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd, - val); + ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd, + val); return (pgd_t) { ret }; } @@ -1199,11 +1199,11 @@ static inline pgdval_t pgd_val(pgd_t pgd) pgdval_t ret; if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val, - pgd.pgd, (u64)pgd.pgd >> 32); + ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val, + pgd.pgd, (u64)pgd.pgd >> 32); else - ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val, - pgd.pgd); + ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val, + pgd.pgd); return ret; } @@ -1267,11 +1267,11 @@ static inline pmd_t __pmd(pmdval_t val) pmdval_t ret; if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd, + val, (u64)val >> 32); else - ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd, - val); + ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd, + val); return (pmd_t) { ret }; } @@ -1281,11 +1281,11 @@ static inline pmdval_t pmd_val(pmd_t pmd) pmdval_t ret; if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val, - pmd.pmd, (u64)pmd.pmd >> 32); + ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val, + pmd.pmd, (u64)pmd.pmd >> 32); else - ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val, - pmd.pmd); + ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val, + pmd.pmd); return ret; } @@ -1307,11 +1307,11 @@ static inline pud_t __pud(pudval_t val) pudval_t ret; if (sizeof(pudval_t) > sizeof(long)) - ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud, + val, (u64)val >> 32); else - ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud, - val); + ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud, + val); return (pud_t) { ret }; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8adb6b5aa42..cea11c8e304 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -391,43 +391,12 @@ struct pv_apic_ops pv_apic_ops = { #endif }; -typedef pte_t make_pte_t(pteval_t); -typedef pmd_t make_pmd_t(pmdval_t); -typedef pud_t make_pud_t(pudval_t); -typedef pgd_t make_pgd_t(pgdval_t); - -typedef pteval_t pte_val_t(pte_t); -typedef pmdval_t pmd_val_t(pmd_t); -typedef pudval_t pud_val_t(pud_t); -typedef pgdval_t pgd_val_t(pgd_t); - - #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) /* 32-bit pagetable entries */ -#define paravirt_native_make_pte (make_pte_t *)_paravirt_ident_32 -#define paravirt_native_pte_val (pte_val_t *)_paravirt_ident_32 - -#define paravirt_native_make_pmd (make_pmd_t *)_paravirt_ident_32 -#define paravirt_native_pmd_val (pmd_val_t *)_paravirt_ident_32 - -#define paravirt_native_make_pud (make_pud_t *)_paravirt_ident_32 -#define paravirt_native_pud_val (pud_val_t *)_paravirt_ident_32 - -#define paravirt_native_make_pgd (make_pgd_t *)_paravirt_ident_32 -#define paravirt_native_pgd_val (pgd_val_t *)_paravirt_ident_32 +#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32) #else /* 64-bit pagetable entries */ -#define paravirt_native_make_pte (make_pte_t *)_paravirt_ident_64 -#define paravirt_native_pte_val (pte_val_t *)_paravirt_ident_64 - -#define paravirt_native_make_pmd (make_pmd_t *)_paravirt_ident_64 -#define paravirt_native_pmd_val (pmd_val_t *)_paravirt_ident_64 - -#define paravirt_native_make_pud (make_pud_t *)_paravirt_ident_64 -#define paravirt_native_pud_val (pud_val_t *)_paravirt_ident_64 - -#define paravirt_native_make_pgd (make_pgd_t *)_paravirt_ident_64 -#define paravirt_native_pgd_val (pgd_val_t *)_paravirt_ident_64 +#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif struct pv_mmu_ops pv_mmu_ops = { @@ -481,21 +450,23 @@ struct pv_mmu_ops pv_mmu_ops = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, - .pmd_val = paravirt_native_pmd_val, - .make_pmd = paravirt_native_make_pmd, + + .pmd_val = PTE_IDENT, + .make_pmd = PTE_IDENT, #if PAGETABLE_LEVELS == 4 - .pud_val = paravirt_native_pud_val, - .make_pud = paravirt_native_make_pud, + .pud_val = PTE_IDENT, + .make_pud = PTE_IDENT, + .set_pgd = native_set_pgd, #endif #endif /* PAGETABLE_LEVELS >= 3 */ - .pte_val = paravirt_native_pte_val, - .pgd_val = paravirt_native_pgd_val, + .pte_val = PTE_IDENT, + .pgd_val = PTE_IDENT, - .make_pte = paravirt_native_make_pte, - .make_pgd = paravirt_native_make_pgd, + .make_pte = PTE_IDENT, + .make_pgd = PTE_IDENT, .dup_mmap = paravirt_nop, .exit_mmap = paravirt_nop, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 94e452c0b00..5e41f7fc6cf 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -492,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte) { return pte_mfn_to_pfn(pte.pte); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); pgdval_t xen_pgd_val(pgd_t pgd) { return pte_mfn_to_pfn(pgd.pgd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { pte = pte_pfn_to_mfn(pte); return native_make_pte(pte); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); return native_make_pgd(pgd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); void xen_set_pud_hyper(pud_t *ptr, pud_t val) { @@ -590,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd) pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); #if PAGETABLE_LEVELS == 4 pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); pud_t xen_make_pud(pudval_t pud) { @@ -603,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud) return native_make_pud(pud); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); pgd_t *xen_get_user_pgd(pgd_t *pgd) { @@ -1813,11 +1821,11 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, - .pte_val = xen_pte_val, - .pgd_val = xen_pgd_val, + .pte_val = PV_CALLEE_SAVE(xen_pte_val), + .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - .make_pte = xen_make_pte, - .make_pgd = xen_make_pgd, + .make_pte = PV_CALLEE_SAVE(xen_make_pte), + .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), #ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, @@ -1827,12 +1835,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { #endif /* CONFIG_X86_PAE */ .set_pud = xen_set_pud_hyper, - .make_pmd = xen_make_pmd, - .pmd_val = xen_pmd_val, + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), #if PAGETABLE_LEVELS == 4 - .pud_val = xen_pud_val, - .make_pud = xen_make_pud, + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), .set_pgd = xen_set_pgd_hyper, .alloc_pud = xen_alloc_pte_init, From 4767afbf1f60f73997a7eb69a86d380f1fb27a92 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 29 Jan 2009 01:51:34 -0800 Subject: [PATCH 10/22] x86/paravirt: fix missing callee-save call on pud_val Impact: Fix build when CONFIG_PARAVIRT_DEBUG is enabled Fix missed convertion to using callee-saved calls for pud_val, which causes a compile error when CONFIG_PARAVIRT_DEBUG is enabled. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/paravirt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2d098b78bc1..b17365c3974 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1321,11 +1321,11 @@ static inline pudval_t pud_val(pud_t pud) pudval_t ret; if (sizeof(pudval_t) > sizeof(long)) - ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val, - pud.pud, (u64)pud.pud >> 32); + ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val, + pud.pud, (u64)pud.pud >> 32); else - ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val, - pud.pud); + ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val, + pud.pud); return ret; } From 2749ebe320ff9f77548d10fcc0a3464ac21c8e58 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 29 Jan 2009 15:35:26 -0600 Subject: [PATCH 11/22] x86: UV fix uv_flush_send_and_wait() Impact: fix possible tlb mis-flushing on UV uv_flush_send_and_wait() should return a pointer if the broadcast remote tlb shootdown requests fail. That causes the conventional IPI method of shootdown to be used. Signed-off-by: Cliff Wickman Signed-off-by: Tejun Heo --- arch/x86/kernel/tlb_uv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 89fce1b6d01..f4b2f27d19b 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -259,7 +259,7 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade, * the cpu's, all of which are still in the mask. */ __get_cpu_var(ptcstats).ptc_i++; - return 0; + return flush_mask; } /* From 552be871e67ff577ed36beb2f53d078b42304739 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 30 Jan 2009 17:47:53 +0900 Subject: [PATCH 12/22] x86: pass in cpu number to switch_to_new_gdt() Impact: cleanup, prepare for xen boot fix. Xen needs to call this function very early to setup the GDT and per-cpu segments. Remove the call to smp_processor_id() and just pass in the cpu number. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/common.c | 7 +++---- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 11 ++++++----- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index befa20b4a68..1c25eb69ea8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -768,7 +768,7 @@ extern int sysenter_setup(void); extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); -extern void switch_to_new_gdt(void); +extern void switch_to_new_gdt(int); extern void cpu_init(void); static inline unsigned long get_debugctlmsr(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 652fdc9a757..6eacd64b602 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -255,10 +255,9 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; /* Current gdt points %fs at the "master" per-cpu area: after this, * it's on the real one. */ -void switch_to_new_gdt(void) +void switch_to_new_gdt(int cpu) { struct desc_ptr gdt_descr; - int cpu = smp_processor_id(); gdt_descr.address = (long)get_cpu_gdt_table(cpu); gdt_descr.size = GDT_SIZE - 1; @@ -993,7 +992,7 @@ void __cpuinit cpu_init(void) * and set up the GDT descriptor: */ - switch_to_new_gdt(); + switch_to_new_gdt(cpu); loadsegment(fs, 0); load_idt((const struct desc_ptr *)&idt_descr); @@ -1098,7 +1097,7 @@ void __cpuinit cpu_init(void) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_idt(&idt_descr); - switch_to_new_gdt(); + switch_to_new_gdt(cpu); /* * Set up and load the per-CPU TSS and LDT diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0d1e7ac439f..ef91747bbed 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -122,7 +122,7 @@ void __init setup_per_cpu_areas(void) * area. Reload any changed state for the boot CPU. */ if (cpu == boot_cpu_id) - switch_to_new_gdt(); + switch_to_new_gdt(cpu); DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f9dbcff4354..612d3c74f6a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1185,7 +1185,7 @@ out: void __init native_smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - switch_to_new_gdt(); + switch_to_new_gdt(me); /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); per_cpu(cpu_state, me) = CPU_ONLINE; diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 331cd6d5648..58c7cac3440 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1746,12 +1746,13 @@ static void __init voyager_smp_prepare_cpus(unsigned int max_cpus) static void __cpuinit voyager_smp_prepare_boot_cpu(void) { - switch_to_new_gdt(); + int cpu = smp_processor_id(); + switch_to_new_gdt(cpu); - cpu_set(smp_processor_id(), cpu_online_map); - cpu_set(smp_processor_id(), cpu_callout_map); - cpu_set(smp_processor_id(), cpu_possible_map); - cpu_set(smp_processor_id(), cpu_present_map); + cpu_set(cpu, cpu_online_map); + cpu_set(cpu, cpu_callout_map); + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); } static int __cpuinit voyager_cpu_up(unsigned int cpu) From 11e3a840cd5b731cdd8f6f956dfae78a8046d09c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 30 Jan 2009 17:47:54 +0900 Subject: [PATCH 13/22] x86: split loading percpu segments from loading gdt Impact: split out a function, no functional change Xen needs to be able to access percpu data from very early on. For various reasons, it cannot also load the gdt at that time. It does, however, have a pefectly functional gdt at that point, so there's no pressing need to reload the gdt. Split the function to load the segment registers off, so Xen can call it directly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Tejun Heo --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/common.c | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1c25eb69ea8..656d02ea509 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -769,6 +769,7 @@ extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); extern void switch_to_new_gdt(int); +extern void load_percpu_segment(int); extern void cpu_init(void); static inline unsigned long get_debugctlmsr(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6eacd64b602..0f73ea42308 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -253,6 +253,16 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +void load_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + loadsegment(fs, __KERNEL_PERCPU); +#else + loadsegment(gs, 0); + wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); +#endif +} + /* Current gdt points %fs at the "master" per-cpu area: after this, * it's on the real one. */ void switch_to_new_gdt(int cpu) @@ -263,12 +273,8 @@ void switch_to_new_gdt(int cpu) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); /* Reload the per-cpu base */ -#ifdef CONFIG_X86_32 - loadsegment(fs, __KERNEL_PERCPU); -#else - loadsegment(gs, 0); - wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); -#endif + + load_percpu_segment(cpu); } static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; From 795f99b61d20c34cb04d17d8906b32f745a635ec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 30 Jan 2009 17:47:54 +0900 Subject: [PATCH 14/22] xen: setup percpu data pointers Impact: fix xen booting We need to access percpu data fairly early, so set up the percpu registers as soon as possible. We only need to load the appropriate segment register. We already have a GDT, but its hard to change it early because we need to manipulate the pagetable to do so, and that hasn't been set up yet. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Tejun Heo --- arch/x86/xen/enlighten.c | 3 +++ arch/x86/xen/smp.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bef941f6145..fe19c88a502 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1647,6 +1647,9 @@ asmlinkage void __init xen_start_kernel(void) have_vcpu_info_placement = 0; #endif + /* setup percpu state */ + load_percpu_segment(0); + xen_smp_init(); /* Get mfn list */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7735e3dd359..88d5d5ec6be 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -170,7 +170,8 @@ static void __init xen_smp_prepare_boot_cpu(void) /* We've switched to the "real" per-cpu gdt, so make sure the old memory can be recycled */ - make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); + make_lowmem_page_readwrite(__per_cpu_load + + (unsigned long)&per_cpu_var(gdt_page)); xen_setup_vcpu_info_placement(); } @@ -235,6 +236,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->user_regs.ss = __KERNEL_DS; #ifdef CONFIG_X86_32 ctxt->user_regs.fs = __KERNEL_PERCPU; +#else + ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ From e584f559c7b8711cccdf319400acd6294b2c074e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 30 Jan 2009 23:17:23 -0800 Subject: [PATCH 15/22] x86/paravirt: don't restore second return reg Impact: bugfix In the 32-bit calling convention, %eax:%edx is used to return 64-bit values. Don't save and restore %edx around wrapped functions, or they can't return a full 64-bit result. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/paravirt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index b17365c3974..016dce31130 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1524,8 +1524,8 @@ extern struct paravirt_patch_site __parainstructions[], #define PV_RESTORE_REGS "popl %edx; popl %ecx;" /* save and restore all caller-save registers, except return value */ -#define PV_SAVE_ALL_CALLER_REGS PV_SAVE_REGS -#define PV_RESTORE_ALL_CALLER_REGS PV_RESTORE_REGS +#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" +#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" #define PV_FLAGS_ARG "0" #define PV_EXTRA_CLOBBERS From 664c7954721adfc9bd61de6ec78f89f482f1a802 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 30 Jan 2009 23:18:41 -0800 Subject: [PATCH 16/22] x86/vmi: fix interrupt enable/disable/save/restore calling convention. Zach says: > Enable/Disable have no clobbers at all. > Save clobbers only return value, %eax > Restore also clobbers nothing. This is precisely compatible with the calling convention, so we can just call them directly without wrapping. (Compile tested only.) Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmi_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 1d3302cc2dd..eb9e7347928 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -670,10 +670,11 @@ static inline int __init activate_vmi(void) para_fill(pv_mmu_ops.write_cr2, SetCR2); para_fill(pv_mmu_ops.write_cr3, SetCR3); para_fill(pv_cpu_ops.write_cr4, SetCR4); - para_fill(pv_irq_ops.save_fl, GetInterruptMask); - para_fill(pv_irq_ops.restore_fl, SetInterruptMask); - para_fill(pv_irq_ops.irq_disable, DisableInterrupts); - para_fill(pv_irq_ops.irq_enable, EnableInterrupts); + + para_fill(pv_irq_ops.save_fl.func, GetInterruptMask); + para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask); + para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts); + para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts); para_fill(pv_cpu_ops.wbinvd, WBINVD); para_fill(pv_cpu_ops.read_tsc, RDTSC); From ef3892bd63420380d115f755d351d2071f1f805f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Feb 2009 18:16:19 -0800 Subject: [PATCH 17/22] x86, percpu: fix kexec with vmlinux Impact: fix regression with kexec with vmlinux Split data.init into data.init, percpu, data.init2 sections instead of let data.init wrap percpu secion. Thus kexec loading will be happy, because sections will not overlap. Before the patch we have: Elf file type is EXEC (Executable file) Entry point 0x200000 There are 6 program headers, starting at offset 64 Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align LOAD 0x0000000000200000 0xffffffff80200000 0x0000000000200000 0x0000000000ca6000 0x0000000000ca6000 R E 200000 LOAD 0x0000000000ea6000 0xffffffff80ea6000 0x0000000000ea6000 0x000000000014dfe0 0x000000000014dfe0 RWE 200000 LOAD 0x0000000001000000 0xffffffffff600000 0x0000000000ff4000 0x0000000000000888 0x0000000000000888 RWE 200000 LOAD 0x00000000011f6000 0xffffffff80ff6000 0x0000000000ff6000 0x0000000000073086 0x0000000000a2d938 RWE 200000 LOAD 0x0000000001400000 0x0000000000000000 0x000000000106a000 0x00000000001d2ce0 0x00000000001d2ce0 RWE 200000 NOTE 0x00000000009e2c1c 0xffffffff809e2c1c 0x00000000009e2c1c 0x0000000000000024 0x0000000000000024 4 Section to Segment mapping: Segment Sections... 00 .text .notes __ex_table .rodata __bug_table .pci_fixup .builtin_fw __ksymtab __ksymtab_gpl __ksymtab_strings __init_rodata __param 01 .data .init.rodata .data.cacheline_aligned .data.read_mostly 02 .vsyscall_0 .vsyscall_fn .vsyscall_gtod_data .vsyscall_1 .vsyscall_2 .vgetcpu_mode .jiffies 03 .data.init_task .smp_locks .init.text .init.data .init.setup .initcall.init .con_initcall.init .x86_cpu_dev.init .altinstructions .altinstr_replacement .exit.text .init.ramfs .bss 04 .data.percpu 05 .notes After patch we've got: Elf file type is EXEC (Executable file) Entry point 0x200000 There are 7 program headers, starting at offset 64 Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align LOAD 0x0000000000200000 0xffffffff80200000 0x0000000000200000 0x0000000000ca6000 0x0000000000ca6000 R E 200000 LOAD 0x0000000000ea6000 0xffffffff80ea6000 0x0000000000ea6000 0x000000000014dfe0 0x000000000014dfe0 RWE 200000 LOAD 0x0000000001000000 0xffffffffff600000 0x0000000000ff4000 0x0000000000000888 0x0000000000000888 RWE 200000 LOAD 0x00000000011f6000 0xffffffff80ff6000 0x0000000000ff6000 0x0000000000073086 0x0000000000073086 RWE 200000 LOAD 0x0000000001400000 0x0000000000000000 0x000000000106a000 0x00000000001d2ce0 0x00000000001d2ce0 RWE 200000 LOAD 0x000000000163d000 0xffffffff8123d000 0x000000000123d000 0x0000000000000000 0x00000000007e6938 RWE 200000 NOTE 0x00000000009e2c1c 0xffffffff809e2c1c 0x00000000009e2c1c 0x0000000000000024 0x0000000000000024 4 Section to Segment mapping: Segment Sections... 00 .text .notes __ex_table .rodata __bug_table .pci_fixup .builtin_fw __ksymtab __ksymtab_gpl __ksymtab_strings __init_rodata __param 01 .data .init.rodata .data.cacheline_aligned .data.read_mostly 02 .vsyscall_0 .vsyscall_fn .vsyscall_gtod_data .vsyscall_1 .vsyscall_2 .vgetcpu_mode .jiffies 03 .data.init_task .smp_locks .init.text .init.data .init.setup .initcall.init .con_initcall.init .x86_cpu_dev.init .altinstructions .altinstr_replacement .exit.text .init.ramfs 04 .data.percpu 05 .bss 06 .notes Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_64.lds.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index c9740996430..07f62d287ff 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -22,6 +22,7 @@ PHDRS { #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(7); /* RWE */ #endif + data.init2 PT_LOAD FLAGS(7); /* RWE */ note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS @@ -215,7 +216,7 @@ SECTIONS /* * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the * output PHDR, so the next output section - __data_nosave - should - * switch it back to data.init. Also, pda should be at the head of + * start another section data.init2. Also, pda should be at the head of * percpu area. Preallocate it and define the percpu offset symbol * so that it can be accessed as a percpu variable. */ @@ -232,7 +233,7 @@ SECTIONS __nosave_begin = .; .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) - } :data.init /* switch back to data.init, see PERCPU_VADDR() above */ + } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ . = ALIGN(PAGE_SIZE); __nosave_end = .; From 0eb592dbba40baebec9cdde3ff4574185de6cbcc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 3 Feb 2009 16:00:38 -0800 Subject: [PATCH 18/22] x86/paravirt: return full 64-bit result Impact: Bug fix A hunk went missing in the original patch, and callee-save callsites were not marked as returning the upper 32-bit of result, causing Badness. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 016dce31130..c85e7475e17 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -522,7 +522,7 @@ int paravirt_disable_iospace(void); "=c" (__ecx) #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) #define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS #define EXTRA_CLOBBERS From 1f4f931501e9270c156d05ee76b7b872de486304 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 2 Feb 2009 13:58:06 -0800 Subject: [PATCH 19/22] xen: fix 32-bit build resulting from mmu move Moving the mmu code from enlighten.c to mmu.c inadvertently broke the 32-bit build. Fix it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/mmu.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e41f7fc6cf..d2e8ed1aff3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1396,6 +1396,43 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } +#ifdef CONFIG_HIGHPTE +static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) +{ + pgprot_t prot = PAGE_KERNEL; + + if (PagePinned(page)) + prot = PAGE_KERNEL_RO; + + if (0 && PageHighMem(page)) + printk("mapping highpte %lx type %d prot %s\n", + page_to_pfn(page), type, + (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); + + return kmap_atomic_prot(page, type, prot); +} +#endif + +#ifdef CONFIG_X86_32 +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); + + return pte; +} + +/* Init-time set_pte while constructing initial pagetables, which + doesn't allow RO pagetable pages to be remapped RW */ +static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + pte = mask_rw_pte(ptep, pte); + + xen_set_pte(ptep, pte); +} +#endif /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ From 383414322b3b3ced0cbc146801e0cc6c60a6c5f4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 2 Feb 2009 13:55:31 -0800 Subject: [PATCH 20/22] xen: setup percpu data pointers We need to access percpu data fairly early, so set up the percpu registers as soon as possible. We only need to load the appropriate segment register. We already have a GDT, but its hard to change it early because we need to manipulate the pagetable to do so, and that hasn't been set up yet. Also, set the kernel stack when bringing up secondary CPUs. If we don't they all end up sharing the same stack... Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/enlighten.c | 15 ++++++++++++++- arch/x86/xen/smp.c | 6 ++++-- arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cd022c43dfb..aed7ceeb4b6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -66,6 +66,8 @@ EXPORT_SYMBOL_GPL(xen_start_info); struct shared_info xen_dummy_shared_info; +void *xen_initial_gdt; + /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. @@ -917,8 +919,19 @@ asmlinkage void __init xen_start_kernel(void) have_vcpu_info_placement = 0; #endif - /* setup percpu state */ +#ifdef CONFIG_X86_64 + /* + * Setup percpu state. We only need to do this for 64-bit + * because 32-bit already has %fs set properly. + */ load_percpu_segment(0); +#endif + /* + * The only reliable way to retain the initial address of the + * percpu gdt_page is to remember it here, so we can go and + * mark it RW later, when the initial percpu area is freed. + */ + xen_initial_gdt = &per_cpu(gdt_page, 0); xen_smp_init(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 88d5d5ec6be..035582ae815 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -170,8 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void) /* We've switched to the "real" per-cpu gdt, so make sure the old memory can be recycled */ - make_lowmem_page_readwrite(__per_cpu_load + - (unsigned long)&per_cpu_var(gdt_page)); + make_lowmem_page_readwrite(xen_initial_gdt); xen_setup_vcpu_info_placement(); } @@ -287,6 +286,9 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) irq_ctx_init(cpu); #else clear_tsk_thread_flag(idle, TIF_FORK); + per_cpu(kernel_stack, cpu) = + (unsigned long)task_stack_page(idle) - + KERNEL_STACK_OFFSET + THREAD_SIZE; #endif xen_setup_timer(cpu); xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 11913fc94c1..2f5ef2632ea 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -10,6 +10,8 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +extern void *xen_initial_gdt; + struct trap_info; void xen_copy_trap_info(struct trap_info *traps); From 5393744b71ce797f1b1546fafaed127fc50c2b61 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 2 Feb 2009 13:55:42 -0800 Subject: [PATCH 21/22] xen: make direct versions of irq_enable/disable/save/restore to common code Now that x86-64 has directly accessible percpu variables, it can also implement the direct versions of these operations, which operate on a vcpu_info structure directly embedded in the percpu area. In fact, the 64-bit versions are more or less identical, and so can be shared. The only two differences are: 1. xen_restore_fl_direct takes its argument in eax on 32-bit, and rdi on 64-bit. Unfortunately it isn't possible to directly refer to the 2nd lsb of rdi directly (as you can with %ah), so the code isn't quite as dense. 2. check_events needs to variants to save different registers. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/Makefile | 3 +- arch/x86/xen/xen-asm.S | 140 ++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-asm.h | 12 ++++ arch/x86/xen/xen-asm_32.S | 111 ++++-------------------------- arch/x86/xen/xen-asm_64.S | 134 +----------------------------------- 5 files changed, 169 insertions(+), 231 deletions(-) create mode 100644 arch/x86/xen/xen-asm.S create mode 100644 arch/x86/xen/xen-asm.h diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 6dcefba7836..3b767d03fd6 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg endif obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ - time.o xen-asm_$(BITS).o grant-table.o suspend.o + time.o xen-asm.o xen-asm_$(BITS).o \ + grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S new file mode 100644 index 00000000000..4c6f9679913 --- /dev/null +++ b/arch/x86/xen/xen-asm.S @@ -0,0 +1,140 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in percpu data) of + the operations here; the indirect forms are better handled in + C, since they're generally too large to inline anyway. + */ + +#include +#include +#include + +#include "xen-asm.h" + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending + jz 1f + +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) +#ifdef CONFIG_X86_64 + testw $X86_EFLAGS_IF, %di +#else + testb $X86_EFLAGS_IF>>8, %ah +#endif + setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. + */ +check_events: +#ifdef CONFIG_X86_32 + push %eax + push %ecx + push %edx + call xen_force_evtchn_callback + pop %edx + pop %ecx + pop %eax +#else + push %rax + push %rcx + push %rdx + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + call xen_force_evtchn_callback + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rdx + pop %rcx + pop %rax +#endif + ret + diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h new file mode 100644 index 00000000000..465276467a4 --- /dev/null +++ b/arch/x86/xen/xen-asm.h @@ -0,0 +1,12 @@ +#ifndef _XEN_XEN_ASM_H +#define _XEN_XEN_ASM_H + +#include + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +#endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 42786f59d9c..082d173caaf 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -11,101 +11,28 @@ generally too large to inline anyway. */ -#include - -#include +//#include #include -#include #include #include #include -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 +#include "xen-asm.h" /* - Enable events. This clears the event mask and tests the pending - event status with one and operation. If there are pending - events, then enter the hypervisor to get them handled. + Force an event check by making a hypercall, + but preserve regs before making the call. */ -ENTRY(xen_irq_enable_direct) - /* Unmask events */ - movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - - /* Test for pending */ - testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending - jz 1f - -2: call check_events -1: -ENDPATCH(xen_irq_enable_direct) +check_events: + push %eax + push %ecx + push %edx + call xen_force_evtchn_callback + pop %edx + pop %ecx + pop %eax ret - ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) - - -/* - Disabling events is simply a matter of making the event mask - non-zero. - */ -ENTRY(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) - ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) - -/* - (xen_)save_fl is used to get the current interrupt enable status. - Callers expect the status to be in X86_EFLAGS_IF, and other bits - may be set in the return value. We take advantage of this by - making sure that X86_EFLAGS_IF has the right value (and other bits - in that byte are 0), but other bits in the return value are - undefined. We need to toggle the state of the bit, because - Xen and x86 use opposite senses (mask vs enable). - */ -ENTRY(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - setz %ah - addb %ah,%ah -ENDPATCH(xen_save_fl_direct) - ret - ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) - - -/* - In principle the caller should be passing us a value return - from xen_save_fl_direct, but for robustness sake we test only - the X86_EFLAGS_IF flag rather than the whole byte. After - setting the interrupt mask state, it checks for unmasked - pending events and enters the hypervisor to get them delivered - if so. - */ -ENTRY(xen_restore_fl_direct) - testb $X86_EFLAGS_IF>>8, %ah - setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - - /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending - jz 1f -2: call check_events -1: -ENDPATCH(xen_restore_fl_direct) - ret - ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) /* We can't use sysexit directly, because we're not running in ring0. @@ -289,17 +216,3 @@ ENTRY(xen_iret_crit_fixup) lea 4(%edi),%esp /* point esp to new frame */ 2: jmp xen_do_upcall - -/* - Force an event check by making a hypercall, - but preserve regs before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call xen_force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index d6fc51f4ce8..d205a283efe 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -11,142 +11,14 @@ generally too large to inline anyway. */ -#include - -#include -#include #include -#include #include +#include +#include #include -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -#if 1 -/* - FIXME: x86_64 now can support direct access to percpu variables - via a segment override. Update xen accordingly. - */ -#define BUG ud2a -#endif - -/* - Enable events. This clears the event mask and tests the pending - event status with one and operation. If there are pending - events, then enter the hypervisor to get them handled. - */ -ENTRY(xen_irq_enable_direct) - BUG - - /* Unmask events */ - movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - - /* Test for pending */ - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jz 1f - -2: call check_events -1: -ENDPATCH(xen_irq_enable_direct) - ret - ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) - -/* - Disabling events is simply a matter of making the event mask - non-zero. - */ -ENTRY(xen_irq_disable_direct) - BUG - - movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) - ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) - -/* - (xen_)save_fl is used to get the current interrupt enable status. - Callers expect the status to be in X86_EFLAGS_IF, and other bits - may be set in the return value. We take advantage of this by - making sure that X86_EFLAGS_IF has the right value (and other bits - in that byte are 0), but other bits in the return value are - undefined. We need to toggle the state of the bit, because - Xen and x86 use opposite senses (mask vs enable). - */ -ENTRY(xen_save_fl_direct) - BUG - - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - setz %ah - addb %ah,%ah -ENDPATCH(xen_save_fl_direct) - ret - ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) - -/* - In principle the caller should be passing us a value return - from xen_save_fl_direct, but for robustness sake we test only - the X86_EFLAGS_IF flag rather than the whole byte. After - setting the interrupt mask state, it checks for unmasked - pending events and enters the hypervisor to get them delivered - if so. - */ -ENTRY(xen_restore_fl_direct) - BUG - - testb $X86_EFLAGS_IF>>8, %ah - setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - - /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jz 1f -2: call check_events -1: -ENDPATCH(xen_restore_fl_direct) - ret - ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) - - -/* - Force an event check by making a hypercall, - but preserve regs before making the call. - */ -check_events: - push %rax - push %rcx - push %rdx - push %rsi - push %rdi - push %r8 - push %r9 - push %r10 - push %r11 - call xen_force_evtchn_callback - pop %r11 - pop %r10 - pop %r9 - pop %r8 - pop %rdi - pop %rsi - pop %rdx - pop %rcx - pop %rax - ret +#include "xen-asm.h" ENTRY(xen_adjust_exception_frame) mov 8+0(%rsp),%rcx From e4d0407185cdbdcfd99fc23bde2e5454bbc46329 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 2 Feb 2009 13:55:54 -0800 Subject: [PATCH 22/22] xen: use direct ops on 64-bit Enable the use of the direct vcpu-access operations on 64-bit. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/enlighten.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index aed7ceeb4b6..37230342c2c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -87,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = -#ifdef CONFIG_X86_32 - 1 -#else - 0 -#endif - ; - +static int have_vcpu_info_placement = 1; static void xen_vcpu_setup(int cpu) { @@ -914,11 +907,6 @@ asmlinkage void __init xen_start_kernel(void) machine_ops = xen_machine_ops; -#ifdef CONFIG_X86_64 - /* Disable until direct per-cpu data access. */ - have_vcpu_info_placement = 0; -#endif - #ifdef CONFIG_X86_64 /* * Setup percpu state. We only need to do this for 64-bit