diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 28895c03cb1..506f152c238 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -877,6 +877,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) static int __init init_amd(struct cpuinfo_x86 *c) { int r; + unsigned level; #ifdef CONFIG_SMP unsigned long value; @@ -899,6 +900,11 @@ static int __init init_amd(struct cpuinfo_x86 *c) 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ clear_bit(0*32+31, &c->x86_capability); + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + r = get_model_name(c); if (!r) { switch (c->x86) { diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S index 43d9fa13618..1f81b79b796 100644 --- a/arch/x86_64/lib/clear_page.S +++ b/arch/x86_64/lib/clear_page.S @@ -5,8 +5,46 @@ .globl clear_page .p2align 4 clear_page: + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movq %rax,x*8(%rdi) + movq %rax,(%rdi) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + leaq 64(%rdi),%rdi + jnz .Lloop + nop + ret +clear_page_end: + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstructions,"a" + .align 8 + .quad clear_page + .quad clear_page_c + .byte X86_FEATURE_REP_GOOD + .byte clear_page_end-clear_page + .byte clear_page_c_end-clear_page_c + .previous + + .section .altinstr_replacement,"ax" +clear_page_c: movl $4096/8,%ecx xorl %eax,%eax rep stosq ret +clear_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S index 621a1976940..8fa19d96a7e 100644 --- a/arch/x86_64/lib/copy_page.S +++ b/arch/x86_64/lib/copy_page.S @@ -8,7 +8,94 @@ .globl copy_page .p2align 4 copy_page: + subq $3*8,%rsp + movq %rbx,(%rsp) + movq %r12,1*8(%rsp) + movq %r13,2*8(%rsp) + + movl $(4096/64)-5,%ecx + .p2align 4 +.Loop64: + dec %rcx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + prefetcht0 5*64(%rsi) + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .Loop64 + + movl $5,%ecx + .p2align 4 +.Loop2: + decl %ecx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Loop2 + + movq (%rsp),%rbx + movq 1*8(%rsp),%r12 + movq 2*8(%rsp),%r13 + addq $3*8,%rsp + ret + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstructions,"a" + .align 8 + .quad copy_page + .quad copy_page_c + .byte X86_FEATURE_REP_GOOD + .byte copy_page_c_end-copy_page_c + .byte copy_page_c_end-copy_page_c + .previous + + .section .altinstr_replacement,"ax" +copy_page_c: movl $4096/8,%ecx rep movsq ret +copy_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S index 79422b6559c..f64569b83b5 100644 --- a/arch/x86_64/lib/copy_user.S +++ b/arch/x86_64/lib/copy_user.S @@ -4,9 +4,12 @@ * Functions to copy from and to user space. */ +#define FIX_ALIGNMENT 1 + #include #include #include + #include /* Standard copy_to_user with segment limit checking */ .globl copy_to_user @@ -18,7 +21,23 @@ copy_to_user: jc bad_to_user cmpq threadinfo_addr_limit(%rax),%rcx jae bad_to_user - jmp copy_user_generic +2: + .byte 0xe9 /* 32bit jump */ + .long .Lcug-1f +1: + + .section .altinstr_replacement,"ax" +3: .byte 0xe9 /* replacement jmp with 8 bit immediate */ + .long copy_user_generic_c-1b /* offset */ + .previous + .section .altinstructions,"a" + .align 8 + .quad 2b + .quad 3b + .byte X86_FEATURE_REP_GOOD + .byte 5 + .byte 5 + .previous /* Standard copy_from_user with segment limit checking */ .globl copy_from_user @@ -53,44 +72,230 @@ bad_to_user: * rsi source * rdx count * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. - * * Output: * eax uncopied bytes or 0 if successful. */ - .globl copy_user_generic + .p2align 4 copy_user_generic: + .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ + .byte 0x66,0x90 +1: + .section .altinstr_replacement,"ax" +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long copy_user_generic_c-1b /* offset */ + .previous + .section .altinstructions,"a" + .align 8 + .quad copy_user_generic + .quad 2b + .byte X86_FEATURE_REP_GOOD + .byte 5 + .byte 5 + .previous +.Lcug: + pushq %rbx + xorl %eax,%eax /*zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movq %r11,(%rdi) +.Ld2: movq %r8,1*8(%rdi) +.Ld3: movq %r9,2*8(%rdi) +.Ld4: movq %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movq %r11,4*8(%rdi) +.Ld6: movq %r8,5*8(%rdi) +.Ld7: movq %r9,6*8(%rdi) +.Ld8: movq %r10,7*8(%rdi) + + decq %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movq %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + ret + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + + /* Some CPUs run faster using the string copy instructions. + This is also a lot simpler. Use them when possible. + Patch in jmps to this code instead of copying it fully + to avoid unwanted aliasing in the exception tables. */ + + /* rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successfull. + * + * Only 4GB of copy is supported. This shouldn't be a problem + * because the kernel normally only writes from/to page sized chunks + * even if user space passed a longer buffer. + * And more would be dangerous because both Intel and AMD have + * errata with rep movsq > 4GB. If someone feels the need to fix + * this please consider this. + */ +copy_user_generic_c: movl %edx,%ecx shrl $3,%ecx andl $7,%edx - jz 5f 1: rep movsq movl %edx,%ecx - xor %eax,%eax 2: rep movsb - ret - /* align here? */ -5: xorl %eax,%eax -6: rep movsq - ret - - .section .fixup,"ax" -3: lea (%rdx,%rcx,8),%rax - ret 4: movl %ecx,%eax ret - .previous +3: lea (%rdx,%rcx,8),%rax + ret .section __ex_table,"a" .quad 1b,3b .quad 2b,4b - .quad 6b,4b .previous diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index 92dd8054460..5554948b555 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S @@ -11,8 +11,6 @@ * * Output: * rax original destination - * - * TODO: check best memcpy for PSC */ .globl __memcpy @@ -20,6 +18,95 @@ .p2align 4 __memcpy: memcpy: + pushq %rbx + movq %rdi,%rax + + movl %edx,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + + movq (%rsi),%r11 + movq 8(%rsi),%r8 + + movq %r11,(%rdi) + movq %r8,1*8(%rdi) + + movq 2*8(%rsi),%r9 + movq 3*8(%rsi),%r10 + + movq %r9,2*8(%rdi) + movq %r10,3*8(%rdi) + + movq 4*8(%rsi),%r11 + movq 5*8(%rsi),%r8 + + movq %r11,4*8(%rdi) + movq %r8,5*8(%rdi) + + movq 6*8(%rsi),%r9 + movq 7*8(%rsi),%r10 + + movq %r9,6*8(%rdi) + movq %r10,7*8(%rdi) + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jnz .Lloop_64 + +.Lhandle_tail: + movl %edx,%ecx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + .p2align 4 +.Lloop_8: + decl %ecx + movq (%rsi),%r8 + movq %r8,(%rdi) + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + movb (%rsi),%r8b + movb %r8b,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + ret +.Lfinal: + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + + .section .altinstructions,"a" + .align 8 + .quad memcpy + .quad memcpy_c + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal-memcpy + .byte memcpy_c_end-memcpy_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi source + * rdx count + */ +memcpy_c: movq %rdi,%rax movl %edx,%ecx shrl $3,%ecx @@ -30,3 +117,5 @@ memcpy: rep movsb ret +memcpy_c_end: + .previous diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index 2aa48f24ed1..ad397f2c7de 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -13,6 +13,98 @@ .p2align 4 memset: __memset: + movq %rdi,%r10 + movq %rdx,%r11 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment +.Lafter_bad_alignment: + + movl %r11d,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %r11d,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + movl %r11d,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + decl %ecx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + +.Lbad_alignment: + cmpq $7,%r11 + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 + jmp .Lafter_bad_alignment + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstructions,"a" + .align 8 + .quad memset + .quad memset_c + .byte X86_FEATURE_REP_GOOD + .byte memset_c_end-memset_c + .byte memset_c_end-memset_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi value + * rdx count + */ +memset_c: movq %rdi,%r9 movl %edx,%r8d andl $7,%r8d @@ -29,3 +121,5 @@ __memset: stosb movq %r9,%rax ret +memset_c_end: + .previous diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h index 41c0ac8559b..76bb6193ae9 100644 --- a/include/asm-x86_64/cpufeature.h +++ b/include/asm-x86_64/cpufeature.h @@ -61,7 +61,7 @@ #define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */ #define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */ #define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */ -/* 4 free */ +#define X86_FEATURE_REP_GOOD (3*32+ 4) /* rep microcode works well on this CPU */ #define X86_FEATURE_CONSTANT_TSC (3*32+5) /* TSC runs at constant rate */ #define X86_FEATURE_SYNC_RDTSC (3*32+6) /* RDTSC syncs CPU core */