x86-64, mem: Convert memmove() to assembly file and fix return value bug

memmove_64.c implements only the memmove() function, which is written entirely
in inline assembly code. It therefore makes no sense to keep the assembly code
in a .c file.

Currently memmove() doesn't store the return value in rax. This may cause
problems if a caller uses the return value. This patch fixes the issue.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
This commit is contained in:
Fenghua Yu 2011-01-17 17:39:15 -08:00 коммит произвёл H. Peter Anvin
Родитель 1bae4ce27c
Коммит 9599ec0471
3 изменённых файлов: 198 добавлений и 192 удалений

Просмотреть файл

@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(memmove);
EXPORT_SYMBOL(empty_zero_page);
#ifndef CONFIG_PARAVIRT

197
arch/x86/lib/memmove_64.S Normal file
Просмотреть файл

@ -0,0 +1,197 @@
/*
* Normally compiler builtins are used, but sometimes the compiler calls out
* of line code. Based on asm-i386/string.h.
*
* This assembly file is a rewrite of the former memmove_64.c file.
* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
*/
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#undef memmove
/*
 * memmove - copy count bytes from src to dest. The two regions may overlap.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 *
 * Registers written besides rax (depending on the path taken):
 * rcx, rdx, rsi, rdi, r8, r9, r10, r11 and the arithmetic flags.
 * The direction flag is set only around the backward rep movsq (label 7)
 * and is cleared again before that path returns.
 *
 * Label map:
 * 1:  tail copy, fewer than 32 bytes left in rdx
 * 2:  backward copy selection (src below dest)
 * 3:  forward copy, 32 bytes per iteration through r8-r11 (loop at 5)
 * 4:  forward copy with rep movsq
 * 6:  backward copy, 32 bytes per iteration through r8-r11 (loop at 8)
 * 7:  backward copy with rep movsq
 * 9-12: tail size classes 8-15, 4-7, 2-3 and 1 byte
 * 13: return
 */
ENTRY(memmove)
CFI_STARTPROC
/*
 * Set the return value first, then send copies shorter than 32 bytes
 * straight to the tail code.
 */
mov %rdi, %rax
cmp $0x20, %rdx
jb 1f
/*
 * Decide forward/backward copy mode: copy backward whenever src is
 * below dest (unsigned compare), forward otherwise.
 */
cmp %rdi, %rsi
jb 2f
/*
 * The movsq instruction has a high startup latency, so sizes below
 * 680 bytes are copied through general registers instead.
 */
cmp $680, %rdx
jb 3f
/*
 * The movsq instruction is only good for the aligned case: use it
 * only when the low bytes of the src and dest addresses are equal.
 */
cmpb %dil, %sil
je 4f
3:
/*
 * Bias the count by -32 so that the sub at the top of the loop borrows
 * (sets CF) exactly when fewer than 32 bytes will remain after the
 * block copied in that iteration.
 */
sub $0x20, %rdx
/*
 * Copy 32 bytes forward per iteration. All four loads are issued
 * before the first store. mov and lea do not modify the flags, so the
 * jae at the bottom still tests the carry produced by the sub here.
 */
5:
sub $0x20, %rdx
movq 0*8(%rsi), %r11
movq 1*8(%rsi), %r10
movq 2*8(%rsi), %r9
movq 3*8(%rsi), %r8
leaq 4*8(%rsi), %rsi
movq %r11, 0*8(%rdi)
movq %r10, 1*8(%rdi)
movq %r9, 2*8(%rdi)
movq %r8, 3*8(%rdi)
leaq 4*8(%rdi), %rdi
jae 5b
/* Undo the bias: rdx is now the number of bytes left (below 32). */
addq $0x20, %rdx
jmp 1f
/*
 * Forward copy with rep movsq.
 * rcx = count / 8 quadwords. The last 8 source bytes are loaded into
 * r11 before the copy and stored to the last 8 bytes of dest (address
 * kept in r10) afterwards, which covers the bytes the quadword copy
 * does not reach when count is not a multiple of 8.
 */
.p2align 4
4:
movq %rdx, %rcx
movq -8(%rsi, %rdx), %r11
lea -8(%rdi, %rdx), %r10
shrq $3, %rcx
rep movsq
movq %r11, (%r10)
jmp 13f
/*
 * Backward copy with rep movsq.
 * rsi/rdi are moved to the last quadword of each region and the
 * direction flag is set for the duration of the copy. The first
 * 8 source bytes are loaded into r11 before the copy and stored to the
 * start of dest (kept in r10) afterwards, which covers the leading
 * bytes the quadword copy does not reach when count is not a multiple
 * of 8.
 */
.p2align 4
7:
movq %rdx, %rcx
movq (%rsi), %r11
movq %rdi, %r10
leaq -8(%rsi, %rdx), %rsi
leaq -8(%rdi, %rdx), %rdi
shrq $3, %rcx
std
rep movsq
cld
movq %r11, (%r10)
jmp 13f
/*
 * Prepare the backward copy. The same selection as in the forward
 * case applies: rep movsq is used only for 680 bytes or more and only
 * when the low bytes of the src and dest addresses are equal.
 */
.p2align 4
2:
cmp $680, %rdx
jb 6f
cmp %dil, %sil
je 7b
6:
/*
 * Move both pointers to one past the end of their regions and bias
 * the count by -32, as in the forward loop.
 */
addq %rdx, %rsi
addq %rdx, %rdi
subq $0x20, %rdx
/*
 * Copy 32 bytes backward per iteration. All four loads are issued
 * before the first store, and jae tests the carry from the subq at the
 * top of the loop (mov and lea leave the flags untouched).
 */
8:
subq $0x20, %rdx
movq -1*8(%rsi), %r11
movq -2*8(%rsi), %r10
movq -3*8(%rsi), %r9
movq -4*8(%rsi), %r8
leaq -4*8(%rsi), %rsi
movq %r11, -1*8(%rdi)
movq %r10, -2*8(%rdi)
movq %r9, -3*8(%rdi)
movq %r8, -4*8(%rdi)
leaq -4*8(%rdi), %rdi
jae 8b
/*
 * Undo the bias so that rdx is the number of bytes left (below 32),
 * then step both pointers back to the head of that remaining range.
 */
addq $0x20, %rdx
subq %rdx, %rsi
subq %rdx, %rdi
/*
 * Tail copy: rdx is below 32 here and rsi/rdi point at the start of
 * the bytes still to be copied. Every size class loads the first and
 * the last chunk of the range into registers before storing anything;
 * the two chunks may overlap each other.
 */
1:
cmpq $16, %rdx
jb 9f
/*
 * Move data from 16 bytes to 31 bytes.
 */
movq 0*8(%rsi), %r11
movq 1*8(%rsi), %r10
movq -2*8(%rsi, %rdx), %r9
movq -1*8(%rsi, %rdx), %r8
movq %r11, 0*8(%rdi)
movq %r10, 1*8(%rdi)
movq %r9, -2*8(%rdi, %rdx)
movq %r8, -1*8(%rdi, %rdx)
jmp 13f
.p2align 4
9:
cmpq $8, %rdx
jb 10f
/*
 * Move data from 8 bytes to 15 bytes.
 */
movq 0*8(%rsi), %r11
movq -1*8(%rsi, %rdx), %r10
movq %r11, 0*8(%rdi)
movq %r10, -1*8(%rdi, %rdx)
jmp 13f
10:
cmpq $4, %rdx
jb 11f
/*
 * Move data from 4 bytes to 7 bytes.
 */
movl (%rsi), %r11d
movl -4(%rsi, %rdx), %r10d
movl %r11d, (%rdi)
movl %r10d, -4(%rdi, %rdx)
jmp 13f
11:
cmp $2, %rdx
jb 12f
/*
 * Move data from 2 bytes to 3 bytes.
 */
movw (%rsi), %r11w
movw -2(%rsi, %rdx), %r10w
movw %r11w, (%rdi)
movw %r10w, -2(%rdi, %rdx)
jmp 13f
12:
/* A count of zero copies nothing. */
cmp $1, %rdx
jb 13f
/*
 * Move data for 1 byte.
 */
movb (%rsi), %r11b
movb %r11b, (%rdi)
/* Common exit: rax still holds the original dest. */
13:
retq
CFI_ENDPROC
ENDPROC(memmove)

Просмотреть файл

@ -1,192 +0,0 @@
/* Normally compiler builtins are used, but sometimes the compiler calls out
of line code. Based on asm-i386/string.h.
*/
#define _STRING_C
#include <linux/string.h>
#include <linux/module.h>
#undef memmove
/*
 * memmove - copy count bytes from src to dest. The two regions may overlap.
 *
 * Returns dest.
 *
 * The whole body is one inline assembly statement. Operand map:
 *   %0  count      (rdx)
 *   %1  src        (rsi)
 *   %2  dest       (rdi)
 *   %3  ret        (rax), set to dest by the first instruction
 *   %4-%7  scratch (any general register chosen by the compiler)
 *   %8  scratch    (rcx), the rep movsq repeat count
 *
 * d0-d7 exist only to tell the compiler which registers the assembly
 * overwrites; their values are never read by C code.
 *
 * Narrow accesses to the scratch operands use the operand-size modifiers
 * (%k4 = 32-bit, %w4 = 16-bit, %b4 = 8-bit name of operand 4). Appending a
 * suffix letter to the 64-bit name ("%4d") only assembles when the compiler
 * happens to pick one of r8-r15 for the operand.
 */
void *memmove(void *dest, const void *src, size_t count)
{
	unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
	char *ret;

	__asm__ __volatile__(
		/*
		 * Set the return value, then send copies shorter than
		 * 32 bytes straight to the tail code at 1:.
		 */
		"mov %2, %3\n\t"
		"cmp $0x20, %0\n\t"
		"jb 1f\n\t"
		/*
		 * Decide forward/backward copy mode: copy backward whenever
		 * src is below dest (unsigned compare).
		 */
		"cmp %2, %1\n\t"
		"jb 2f\n\t"
		/*
		 * The movsq instruction has a high startup latency, so sizes
		 * below 680 bytes are copied through general registers.
		 */
		"cmp $680, %0\n\t"
		"jb 3f\n\t"
		/*
		 * The movsq instruction is only good for the aligned case:
		 * use it only when the low address bytes are equal.
		 */
		"cmpb %%dil, %%sil\n\t"
		"je 4f\n\t"
		"3:\n\t"
		/*
		 * Bias the count by -32 so that the sub at the top of the
		 * loop borrows exactly when fewer than 32 bytes will remain
		 * after the block copied in that iteration.
		 */
		"sub $0x20, %0\n\t"
		/*
		 * Copy 32 bytes forward per iteration. mov and lea leave the
		 * flags alone, so jae tests the carry of the sub below.
		 */
		"5:\n\t"
		"sub $0x20, %0\n\t"
		"movq 0*8(%1), %4\n\t"
		"movq 1*8(%1), %5\n\t"
		"movq 2*8(%1), %6\n\t"
		"movq 3*8(%1), %7\n\t"
		"leaq 4*8(%1), %1\n\t"
		"movq %4, 0*8(%2)\n\t"
		"movq %5, 1*8(%2)\n\t"
		"movq %6, 2*8(%2)\n\t"
		"movq %7, 3*8(%2)\n\t"
		"leaq 4*8(%2), %2\n\t"
		"jae 5b\n\t"
		/* Undo the bias: %0 is the number of bytes left (below 32). */
		"addq $0x20, %0\n\t"
		"jmp 1f\n\t"
		/*
		 * Forward copy with rep movsq. The last 8 source bytes are
		 * loaded before the copy and stored afterwards to cover a
		 * count that is not a multiple of 8.
		 */
		".p2align 4\n\t"
		"4:\n\t"
		"movq %0, %8\n\t"
		"movq -8(%1, %0), %4\n\t"
		"lea -8(%2, %0), %5\n\t"
		"shrq $3, %8\n\t"
		"rep movsq\n\t"
		"movq %4, (%5)\n\t"
		"jmp 13f\n\t"
		/*
		 * Backward copy with rep movsq, with the direction flag set
		 * only for the duration of the copy. The first 8 source bytes
		 * are loaded before the copy and stored afterwards to cover a
		 * count that is not a multiple of 8.
		 */
		".p2align 4\n\t"
		"7:\n\t"
		"movq %0, %8\n\t"
		"movq (%1), %4\n\t"
		"movq %2, %5\n\t"
		"leaq -8(%1, %0), %1\n\t"
		"leaq -8(%2, %0), %2\n\t"
		"shrq $3, %8\n\t"
		"std\n\t"
		"rep movsq\n\t"
		"cld\n\t"
		"movq %4, (%5)\n\t"
		"jmp 13f\n\t"
		/*
		 * Prepare the backward copy, using the same size and
		 * alignment selection as the forward case.
		 */
		".p2align 4\n\t"
		"2:\n\t"
		"cmp $680, %0\n\t"
		"jb 6f\n\t"
		"cmp %%dil, %%sil\n\t"
		"je 7b\n\t"
		"6:\n\t"
		/*
		 * Move both pointers to one past the end of their regions
		 * and bias the count by -32, as in the forward loop.
		 */
		"addq %0, %1\n\t"
		"addq %0, %2\n\t"
		"subq $0x20, %0\n\t"
		/*
		 * Copy 32 bytes backward per iteration.
		 */
		"8:\n\t"
		"subq $0x20, %0\n\t"
		"movq -1*8(%1), %4\n\t"
		"movq -2*8(%1), %5\n\t"
		"movq -3*8(%1), %6\n\t"
		"movq -4*8(%1), %7\n\t"
		"leaq -4*8(%1), %1\n\t"
		"movq %4, -1*8(%2)\n\t"
		"movq %5, -2*8(%2)\n\t"
		"movq %6, -3*8(%2)\n\t"
		"movq %7, -4*8(%2)\n\t"
		"leaq -4*8(%2), %2\n\t"
		"jae 8b\n\t"
		/*
		 * Undo the bias, then step both pointers back to the head of
		 * the remaining range.
		 */
		"addq $0x20, %0\n\t"
		"subq %0, %1\n\t"
		"subq %0, %2\n\t"
		/*
		 * Tail copy: fewer than 32 bytes are left. Every size class
		 * loads the first and last chunk of the range before storing
		 * anything; the two chunks may overlap each other.
		 */
		"1:\n\t"
		"cmpq $16, %0\n\t"
		"jb 9f\n\t"
		/*
		 * Move data from 16 bytes to 31 bytes.
		 */
		"movq 0*8(%1), %4\n\t"
		"movq 1*8(%1), %5\n\t"
		"movq -2*8(%1, %0), %6\n\t"
		"movq -1*8(%1, %0), %7\n\t"
		"movq %4, 0*8(%2)\n\t"
		"movq %5, 1*8(%2)\n\t"
		"movq %6, -2*8(%2, %0)\n\t"
		"movq %7, -1*8(%2, %0)\n\t"
		"jmp 13f\n\t"
		".p2align 4\n\t"
		"9:\n\t"
		"cmpq $8, %0\n\t"
		"jb 10f\n\t"
		/*
		 * Move data from 8 bytes to 15 bytes.
		 */
		"movq 0*8(%1), %4\n\t"
		"movq -1*8(%1, %0), %5\n\t"
		"movq %4, 0*8(%2)\n\t"
		"movq %5, -1*8(%2, %0)\n\t"
		"jmp 13f\n\t"
		"10:\n\t"
		"cmpq $4, %0\n\t"
		"jb 11f\n\t"
		/*
		 * Move data from 4 bytes to 7 bytes.
		 */
		"movl (%1), %k4\n\t"
		"movl -4(%1, %0), %k5\n\t"
		"movl %k4, (%2)\n\t"
		"movl %k5, -4(%2, %0)\n\t"
		"jmp 13f\n\t"
		"11:\n\t"
		"cmp $2, %0\n\t"
		"jb 12f\n\t"
		/*
		 * Move data from 2 bytes to 3 bytes.
		 */
		"movw (%1), %w4\n\t"
		"movw -2(%1, %0), %w5\n\t"
		"movw %w4, (%2)\n\t"
		"movw %w5, -2(%2, %0)\n\t"
		"jmp 13f\n\t"
		"12:\n\t"
		/* A count of zero copies nothing. */
		"cmp $1, %0\n\t"
		"jb 13f\n\t"
		/*
		 * Move data for 1 byte.
		 */
		"movb (%1), %b4\n\t"
		"movb %b4, (%2)\n\t"
		"13:\n\t"
		: "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
		  "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
		:"0" (count),
		 "1" (src),
		 "2" (dest)
		:"memory");

	return ret;
}
EXPORT_SYMBOL(memmove);