Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86/asm changes from Ingo Molnar. * 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86: Include probe_roms.h in probe_roms.c; x86/32: Print control and debug registers for kernel context; x86: Tighten dependencies of CPU_SUP_*_32; x86/numa: Improve internode cache alignment; x86: Fix the NMI nesting comments; x86-64: Improve insn scheduling in SAVE_ARGS_IRQ; x86-64: Fix CFI annotations for NMI nesting code; bitops: Add missing parentheses to new get_order macro; bitops: Optimise get_order(); bitops: Adjust the comment on get_order() to describe the size==0 case; x86/spinlocks: Eliminate TICKET_MASK; x86-64: Handle byte-wise tail copying in memcpy() without a loop; x86-64: Fix memcpy() to support sizes of 4Gb and above; x86-64: Fix memset() to support sizes of 4Gb and above; x86-64: Slightly shorten copy_page()
2012-03-22 09:13:24 -07:00 · 2012-03-22 09:13:24 -07:00 · e17fdf5c67
parent 95211279c5 a240ada241
commit e17fdf5c67
10 changed files with 128 additions and 98 deletions
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@ -303,7 +303,6 @@ config X86_GENERIC
 config X86_INTERNODE_CACHE_SHIFT
 	int
 	default "12" if X86_VSMP
 	default "7" if NUMA
 	default X86_L1_CACHE_SHIFT
 config X86_CMPXCHG
@ -441,7 +440,7 @@ config CPU_SUP_INTEL
 config CPU_SUP_CYRIX_32
 	default y
 	bool "Support Cyrix processors" if PROCESSOR_SELECT
-	depends on !64BIT
+	depends on M386 || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT)
 	---help---
 	  This enables detection, tunings and quirks for Cyrix processors
@ -495,7 +494,7 @@ config CPU_SUP_TRANSMETA_32
 config CPU_SUP_UMC_32
 	default y
 	bool "Support UMC processors" if PROCESSOR_SELECT
-	depends on !64BIT
+	depends on M386 || M486 || (EXPERT && !64BIT)
 	---help---
 	  This enables detection, tunings and quirks for UMC processors
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@ -88,14 +88,14 @@ static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
-	return !!(tmp.tail ^ tmp.head);
+	return tmp.tail != tmp.head;
 }
 static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
-	return ((tmp.tail - tmp.head) & TICKET_MASK) > 1;
+	return (__ticket_t)(tmp.tail - tmp.head) > 1;
 }
 #ifndef CONFIG_PARAVIRT_SPINLOCKS
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@ -16,7 +16,6 @@ typedef u32 __ticketpair_t;
 #endif
 #define TICKET_SHIFT	(sizeof(__ticket_t) * 8)
 #define TICKET_MASK	((__ticket_t)((1 << TICKET_SHIFT) - 1))
 typedef struct arch_spinlock {
 	union {
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs)
 	int i;
 	print_modules();
-	__show_regs(regs, 0);
+	__show_regs(regs, !user_mode_vm(regs));
 	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
 		TASK_COMM_LEN, current->comm, task_pid_nr(current),
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@ -320,7 +320,7 @@ ENDPROC(native_usergs_sysret64)
 	movq %rsp, %rsi
 	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
-	testl $3, CS(%rdi)
+	testl $3, CS-RBP(%rsi)
 	je 1f
 	SWAPGS
 	/*
@ -330,11 +330,10 @@ ENDPROC(native_usergs_sysret64)
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
 1:	incl PER_CPU_VAR(irq_count)
-	jne 2f
+	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
 	mov PER_CPU_VAR(irq_stack_ptr),%rsp
 	CFI_DEF_CFA_REGISTER	rsi
-2:	/* Store previous stack value */
+	/* Store previous stack value */
 	pushq %rsi
 	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
 			0x77 /* DW_OP_breg7 */, 0, \
@ -1530,6 +1529,7 @@ ENTRY(nmi)
 	/* Use %rdx as out temp variable throughout */
 	pushq_cfi %rdx
 	CFI_REL_OFFSET rdx, 0
 	/*
 	 * If %cs was not the kernel segment, then the NMI triggered in user
@ -1554,6 +1554,7 @@ ENTRY(nmi)
 	 */
 	lea 6*8(%rsp), %rdx
 	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
 	CFI_REMEMBER_STATE
 nested_nmi:
 	/*
@ -1585,10 +1586,12 @@ nested_nmi:
 nested_nmi_out:
 	popq_cfi %rdx
 	CFI_RESTORE rdx
 	/* No need to check faults here */
 	INTERRUPT_RETURN
 	CFI_RESTORE_STATE
 first_nmi:
 	/*
 	 * Because nested NMIs will use the pushed location that we
@ -1620,10 +1623,15 @@ first_nmi:
 	 * | pt_regs                 |
 	 * +-------------------------+
 	 *
-	 * The saved RIP is used to fix up the copied RIP that a nested
+	 * The saved stack frame is used to fix up the copied stack frame
-	 * NMI may zero out. The original stack frame and the temp storage
+	 * that a nested NMI may change to make the interrupted NMI iret jump
 	 * to the repeat_nmi. The original stack frame and the temp storage
 	 * is also used by nested NMIs and can not be trusted on exit.
 	 */
 	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
 	movq (%rsp), %rdx
 	CFI_RESTORE rdx
 	/* Set the NMI executing variable on the stack. */
 	pushq_cfi $1
@ -1631,22 +1639,39 @@ first_nmi:
 	.rept 5
 	pushq_cfi 6*8(%rsp)
 	.endr
 	CFI_DEF_CFA_OFFSET SS+8-RIP
 	/* Everything up to here is safe from nested NMIs */
 	/*
 	 * If there was a nested NMI, the first NMI's iret will return
 	 * here. But NMIs are still enabled and we can take another
 	 * nested NMI. The nested NMI checks the interrupted RIP to see
 	 * if it is between repeat_nmi and end_repeat_nmi, and if so
 	 * it will just return, as we are about to repeat an NMI anyway.
 	 * This makes it safe to copy to the stack frame that a nested
 	 * NMI will update.
 	 */
 repeat_nmi:
 	/*
 	 * Update the stack variable to say we are still in NMI (the update
 	 * is benign for the non-repeat case, where 1 was pushed just above
 	 * to this very stack slot).
 	 */
 	movq $1, 5*8(%rsp)
 	/* Make another copy, this one may be modified by nested NMIs */
 	.rept 5
 	pushq_cfi 4*8(%rsp)
 	.endr
-
+	CFI_DEF_CFA_OFFSET SS+8-RIP
-	/* Do not pop rdx, nested NMIs will corrupt it */
+end_repeat_nmi:
 	movq 11*8(%rsp), %rdx
 	/*
 	 * Everything below this point can be preempted by a nested
-	 * NMI if the first NMI took an exception. Repeated NMIs
+	 * NMI if the first NMI took an exception and reset our iret stack
-	 * caused by an exception and nested NMI will start here, and
+	 * so that we repeat another NMI.
 	 * can still be preempted by another NMI.
 	 */
 restart_nmi:
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@ -1675,26 +1700,6 @@ nmi_restore:
 	CFI_ENDPROC
 END(nmi)
 	/*
 	 * If an NMI hit an iret because of an exception or breakpoint,
 	 * it can lose its NMI context, and a nested NMI may come in.
 	 * In that case, the nested NMI will change the preempted NMI's
 	 * stack to jump to here when it does the final iret.
 	 */
 repeat_nmi:
 	INTR_FRAME
 	/* Update the stack variable to say we are still in NMI */
 	movq $1, 5*8(%rsp)
 	/* copy the saved stack back to copy stack */
 	.rept 5
 	pushq_cfi 4*8(%rsp)
 	.endr
 	jmp restart_nmi
 	CFI_ENDPROC
 end_repeat_nmi:
 ENTRY(ignore_sysret)
 	CFI_STARTPROC
 	mov $-ENOSYS,%eax
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@ -12,6 +12,7 @@
 #include <linux/pci.h>
 #include <linux/export.h>
 #include <asm/probe_roms.h>
 #include <asm/pci-direct.h>
 #include <asm/e820.h>
 #include <asm/mmzone.h>
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@ -20,14 +20,12 @@ ENDPROC(copy_page_c)
 ENTRY(copy_page)
 	CFI_STARTPROC
-	subq	$3*8,%rsp
+	subq	$2*8,%rsp
-	CFI_ADJUST_CFA_OFFSET 3*8
+	CFI_ADJUST_CFA_OFFSET 2*8
 	movq	%rbx,(%rsp)
 	CFI_REL_OFFSET rbx, 0
 	movq	%r12,1*8(%rsp)
 	CFI_REL_OFFSET r12, 1*8
 	movq	%r13,2*8(%rsp)
 	CFI_REL_OFFSET r13, 2*8
 	movl	$(4096/64)-5,%ecx
 	.p2align 4
@ -91,10 +89,8 @@ ENTRY(copy_page)
 	CFI_RESTORE rbx
 	movq	1*8(%rsp),%r12
 	CFI_RESTORE r12
-	movq	2*8(%rsp),%r13
+	addq	$2*8,%rsp
-	CFI_RESTORE r13
+	CFI_ADJUST_CFA_OFFSET -2*8
 	addq	$3*8,%rsp
 	CFI_ADJUST_CFA_OFFSET -3*8
 	ret
 .Lcopy_page_end:
 	CFI_ENDPROC
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@ -27,9 +27,8 @@
 	.section .altinstr_replacement, "ax", @progbits
 .Lmemcpy_c:
 	movq %rdi, %rax
-
+	movq %rdx, %rcx
-	movl %edx, %ecx
+	shrq $3, %rcx
 	shrl $3, %ecx
 	andl $7, %edx
 	rep movsq
 	movl %edx, %ecx
@ -48,8 +47,7 @@
 	.section .altinstr_replacement, "ax", @progbits
 .Lmemcpy_c_e:
 	movq %rdi, %rax
-
+	movq %rdx, %rcx
 	movl %edx, %ecx
 	rep movsb
 	ret
 .Lmemcpy_e_e:
@ -60,10 +58,7 @@ ENTRY(memcpy)
 	CFI_STARTPROC
 	movq %rdi, %rax
-	/*
+	cmpq $0x20, %rdx
 	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
 	cmp  $0x20, %edx
 	jb .Lhandle_tail
 	/*
@ -72,7 +67,7 @@ ENTRY(memcpy)
 	 */
 	cmp  %dil, %sil
 	jl .Lcopy_backward
-	subl $0x20, %edx
+	subq $0x20, %rdx
 .Lcopy_forward_loop:
 	subq $0x20,	%rdx
@ -91,7 +86,7 @@ ENTRY(memcpy)
 	movq %r11,	3*8(%rdi)
 	leaq 4*8(%rdi),	%rdi
 	jae  .Lcopy_forward_loop
-	addq $0x20,	%rdx
+	addl $0x20,	%edx
 	jmp  .Lhandle_tail
 .Lcopy_backward:
@ -123,11 +118,11 @@ ENTRY(memcpy)
 	/*
 	 * Calculate copy position to head.
 	 */
-	addq $0x20,	%rdx
+	addl $0x20,	%edx
 	subq %rdx,	%rsi
 	subq %rdx,	%rdi
 .Lhandle_tail:
-	cmpq $16,	%rdx
+	cmpl $16,	%edx
 	jb   .Lless_16bytes
 	/*
@ -144,7 +139,7 @@ ENTRY(memcpy)
 	retq
 	.p2align 4
 .Lless_16bytes:
-	cmpq $8,	%rdx
+	cmpl $8,	%edx
 	jb   .Lless_8bytes
 	/*
 	 * Move data from 8 bytes to 15 bytes.
@ -156,7 +151,7 @@ ENTRY(memcpy)
 	retq
 	.p2align 4
 .Lless_8bytes:
-	cmpq $4,	%rdx
+	cmpl $4,	%edx
 	jb   .Lless_3bytes
 	/*
@ -169,18 +164,19 @@ ENTRY(memcpy)
 	retq
 	.p2align 4
 .Lless_3bytes:
-	cmpl $0, %edx
+	subl $1, %edx
-	je .Lend
+	jb .Lend
 	/*
 	 * Move data from 1 bytes to 3 bytes.
 	 */
-.Lloop_1:
+	movzbl (%rsi), %ecx
-	movb (%rsi), %r8b
+	jz .Lstore_1byte
-	movb %r8b, (%rdi)
+	movzbq 1(%rsi), %r8
-	incq %rdi
+	movzbq (%rsi, %rdx), %r9
-	incq %rsi
+	movb %r8b, 1(%rdi)
-	decl %edx
+	movb %r9b, (%rdi, %rdx)
-	jnz .Lloop_1
+.Lstore_1byte:
 	movb %cl, (%rdi)
 .Lend:
 	retq
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@ -19,16 +19,15 @@
 	.section .altinstr_replacement, "ax", @progbits
 .Lmemset_c:
 	movq %rdi,%r9
-	movl %edx,%r8d
+	movq %rdx,%rcx
-	andl $7,%r8d
+	andl $7,%edx
-	movl %edx,%ecx
+	shrq $3,%rcx
 	shrl $3,%ecx
 	/* expand byte value  */
 	movzbl %sil,%esi
 	movabs $0x0101010101010101,%rax
-	mulq %rsi		/* with rax, clobbers rdx */
+	imulq %rsi,%rax
 	rep stosq
-	movl %r8d,%ecx
+	movl %edx,%ecx
 	rep stosb
 	movq %r9,%rax
 	ret
@ -50,7 +49,7 @@
 .Lmemset_c_e:
 	movq %rdi,%r9
 	movb %sil,%al
-	movl %edx,%ecx
+	movq %rdx,%rcx
 	rep stosb
 	movq %r9,%rax
 	ret
@ -61,12 +60,11 @@ ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
 	movq %rdi,%r10
 	movq %rdx,%r11
 	/* expand byte value  */
 	movzbl %sil,%ecx
 	movabs $0x0101010101010101,%rax
-	mul    %rcx		/* with rax, clobbers rdx */
+	imulq  %rcx,%rax
 	/* align dst */
 	movl  %edi,%r9d
@ -75,13 +73,13 @@ ENTRY(__memset)
 	CFI_REMEMBER_STATE
 .Lafter_bad_alignment:
-	movl %r11d,%ecx
+	movq  %rdx,%rcx
-	shrl $6,%ecx
+	shrq  $6,%rcx
 	jz	 .Lhandle_tail
 	.p2align 4
 .Lloop_64:
-	decl   %ecx
+	decq  %rcx
 	movq  %rax,(%rdi)
 	movq  %rax,8(%rdi)
 	movq  %rax,16(%rdi)
@ -97,7 +95,7 @@ ENTRY(__memset)
 	   to predict jump tables. */
 	.p2align 4
 .Lhandle_tail:
-	movl	%r11d,%ecx
+	movl	%edx,%ecx
 	andl    $63&(~7),%ecx
 	jz 		.Lhandle_7
 	shrl	$3,%ecx
@ -109,12 +107,11 @@ ENTRY(__memset)
 	jnz    .Lloop_8
 .Lhandle_7:
-	movl	%r11d,%ecx
+	andl	$7,%edx
 	andl	$7,%ecx
 	jz      .Lende
 	.p2align 4
 .Lloop_1:
-	decl    %ecx
+	decl    %edx
 	movb 	%al,(%rdi)
 	leaq	1(%rdi),%rdi
 	jnz     .Lloop_1
@ -125,13 +122,13 @@ ENTRY(__memset)
 	CFI_RESTORE_STATE
 .Lbad_alignment:
-	cmpq $7,%r11
+	cmpq $7,%rdx
 	jbe	.Lhandle_7
 	movq %rax,(%rdi)	/* unaligned store */
 	movq $8,%r8
 	subq %r9,%r8
 	addq %r8,%rdi
-	subq %r8,%r11
+	subq %r8,%rdx
 	jmp .Lafter_bad_alignment
 .Lfinal:
 	CFI_ENDPROC
--- a/include/asm-generic/getorder.h
+++ b/include/asm-generic/getorder.h
@ -4,21 +4,58 @@
 #ifndef __ASSEMBLY__
 #include <linux/compiler.h>
 #include <linux/log2.h>
-/* Pure 2^n version of get_order */
+/*
-static inline __attribute_const__ int get_order(unsigned long size)
+ * Runtime evaluation of get_order()
 */
 static inline __attribute_const__
 int __get_order(unsigned long size)
 {
 	int order;
-	size = (size - 1) >> (PAGE_SHIFT - 1);
+	size--;
-	order = -1;
+	size >>= PAGE_SHIFT;
-	do {
+#if BITS_PER_LONG == 32
-		size >>= 1;
+	order = fls(size);
-		order++;
+#else
-	} while (size);
+	order = fls64(size);
 #endif
 	return order;
 }
 /**
 * get_order - Determine the allocation order of a memory size
 * @size: The size for which to get the order
 *
 * Determine the allocation order of a particular sized block of memory.  This
 * is on a logarithmic scale, where:
 *
 *	0 -> 2^0 * PAGE_SIZE and below
 *	1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
 *	2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
 *	3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
 *	4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
 *	...
 *
 * The order returned is used to find the smallest allocation granule required
 * to hold an object of the specified size.
 *
 * The result is undefined if the size is 0.
 *
 * This function may be used to initialise variables with compile time
 * evaluations of constants.
 */
 #define get_order(n)						\
 (								\
 	__builtin_constant_p(n) ? (				\
 		((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT :	\
 		(((n) < (1UL << PAGE_SHIFT)) ? 0 :		\
 		 ilog2((n) - 1) - PAGE_SHIFT + 1)		\
 	) :							\
 	__get_order(n)						\
 )
 #endif	/* __ASSEMBLY__ */
 #endif	/* __ASM_GENERIC_GETORDER_H */