arm64: lib: Implement optimized memmove routine

This patch, based on Linaro's Cortex Strings library, improves
the performance of the assembly-optimized memmove() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Author: zhichang.yuan, 2014-04-28 06:11:30 +01:00 (committed by Catalin Marinas)
Parent: 808dbac6b5
Commit: 280adc1951
1 changed file: 165 additions and 25 deletions
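
For orientation, the routine follows the usual memmove() contract: if the destination does
not overlap the tail of the source, the copy can proceed forwards (the assembly simply
branches to memcpy); otherwise the buffer is copied from its highest address downwards so
that no source byte is overwritten before it has been read. The minimal C sketch below
models only that dispatch; memmove_sketch is a hypothetical name, not kernel code, and the
assembly in the diff does the backward copy in aligned multi-register blocks rather than
byte by byte.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch of the dispatch done by the assembly routine:
 * copy forwards when that is safe, otherwise copy backwards
 * (byte by byte here only for brevity).
 */
static void *memmove_sketch(void *dest, const void *src, size_t n)
{
        unsigned char *d = dest;
        const unsigned char *s = src;
        uintptr_t di = (uintptr_t)d, si = (uintptr_t)s;

        /* dest below src, or entirely past the end of src: a forward copy
         * is safe (the arm64 code tail-calls its own memcpy here). */
        if (di < si || di >= si + n) {
                for (size_t i = 0; i < n; i++)
                        d[i] = s[i];
                return dest;
        }

        /* dest overlaps the tail of src: copy from the end downwards. */
        while (n--)
                d[n] = s[n];

        return dest;
}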

arch/arm64/lib/memmove.S

@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Move a buffer from src to dest (alignment handled by the hardware).
@@ -28,30 +37,161 @@
  * Returns:
  *      x0 - dest
  */
+dstin   .req    x0
+src     .req    x1
+count   .req    x2
+tmp1    .req    x3
+tmp1w   .req    w3
+tmp2    .req    x4
+tmp2w   .req    w4
+tmp3    .req    x5
+tmp3w   .req    w5
+dst     .req    x6
+
+A_l     .req    x7
+A_h     .req    x8
+B_l     .req    x9
+B_h     .req    x10
+C_l     .req    x11
+C_h     .req    x12
+D_l     .req    x13
+D_h     .req    x14
+
 ENTRY(memmove)
-        cmp     x0, x1
-        b.ls    memcpy
-        add     x4, x0, x2
-        add     x1, x1, x2
-        subs    x2, x2, #8
-        b.mi    2f
-1:      ldr     x3, [x1, #-8]!
-        subs    x2, x2, #8
-        str     x3, [x4, #-8]!
-        b.pl    1b
-2:      adds    x2, x2, #4
-        b.mi    3f
-        ldr     w3, [x1, #-4]!
-        sub     x2, x2, #4
-        str     w3, [x4, #-4]!
-3:      adds    x2, x2, #2
-        b.mi    4f
-        ldrh    w3, [x1, #-2]!
-        sub     x2, x2, #2
-        strh    w3, [x4, #-2]!
-4:      adds    x2, x2, #1
-        b.mi    5f
-        ldrb    w3, [x1, #-1]
-        strb    w3, [x4, #-1]
-5:      ret
+        cmp     dstin, src
+        b.lo    memcpy
+        add     tmp1, src, count
+        cmp     dstin, tmp1
+        b.hs    memcpy          /* No overlap. */
+
+        add     dst, dstin, count
+        add     src, src, count
+        cmp     count, #16
+        b.lo    .Ltail15        /* Probably unaligned accesses. */
+
+        ands    tmp2, src, #15  /* Bytes to reach alignment. */
+        b.eq    .LSrcAligned
+        sub     count, count, tmp2
+        /*
+        * Copy the leading bytes needed to make src aligned first. The cost
+        * of these few extra instructions is acceptable, and it makes all
+        * subsequent accesses aligned.
+        */
+        tbz     tmp2, #0, 1f
+        ldrb    tmp1w, [src, #-1]!
+        strb    tmp1w, [dst, #-1]!
+1:
+        tbz     tmp2, #1, 2f
+        ldrh    tmp1w, [src, #-2]!
+        strh    tmp1w, [dst, #-2]!
+2:
+        tbz     tmp2, #2, 3f
+        ldr     tmp1w, [src, #-4]!
+        str     tmp1w, [dst, #-4]!
+3:
+        tbz     tmp2, #3, .LSrcAligned
+        ldr     tmp1, [src, #-8]!
+        str     tmp1, [dst, #-8]!
+
+.LSrcAligned:
+        cmp     count, #64
+        b.ge    .Lcpy_over64
+
+        /*
+        * Deal with small copies quickly by dropping straight into the
+        * exit block.
+        */
+.Ltail63:
+        /*
+        * Copy up to 48 bytes of data. At this point we only need the
+        * bottom 6 bits of count to be accurate.
+        */
+        ands    tmp1, count, #0x30
+        b.eq    .Ltail15
+        cmp     tmp1w, #0x20
+        b.eq    1f
+        b.lt    2f
+        ldp     A_l, A_h, [src, #-16]!
+        stp     A_l, A_h, [dst, #-16]!
+1:
+        ldp     A_l, A_h, [src, #-16]!
+        stp     A_l, A_h, [dst, #-16]!
+2:
+        ldp     A_l, A_h, [src, #-16]!
+        stp     A_l, A_h, [dst, #-16]!
+
+.Ltail15:
+        tbz     count, #3, 1f
+        ldr     tmp1, [src, #-8]!
+        str     tmp1, [dst, #-8]!
+1:
+        tbz     count, #2, 2f
+        ldr     tmp1w, [src, #-4]!
+        str     tmp1w, [dst, #-4]!
+2:
+        tbz     count, #1, 3f
+        ldrh    tmp1w, [src, #-2]!
+        strh    tmp1w, [dst, #-2]!
+3:
+        tbz     count, #0, .Lexitfunc
+        ldrb    tmp1w, [src, #-1]
+        strb    tmp1w, [dst, #-1]
+
+.Lexitfunc:
+        ret
+
+.Lcpy_over64:
+        subs    count, count, #128
+        b.ge    .Lcpy_body_large
+        /*
+        * Less than 128 bytes to copy, so handle 64 bytes here and then jump
+        * to the tail.
+        */
+        ldp     A_l, A_h, [src, #-16]
+        stp     A_l, A_h, [dst, #-16]
+        ldp     B_l, B_h, [src, #-32]
+        ldp     C_l, C_h, [src, #-48]
+        stp     B_l, B_h, [dst, #-32]
+        stp     C_l, C_h, [dst, #-48]
+        ldp     D_l, D_h, [src, #-64]!
+        stp     D_l, D_h, [dst, #-64]!
+
+        tst     count, #0x3f
+        b.ne    .Ltail63
+        ret
+
+        /*
+        * Critical loop. Start at a new cache line boundary. Assuming
+        * 64 bytes per line this ensures the entire loop is in one line.
+        */
+        .p2align        L1_CACHE_SHIFT
+.Lcpy_body_large:
+        /* Pre-load 64 bytes of data. */
+        ldp     A_l, A_h, [src, #-16]
+        ldp     B_l, B_h, [src, #-32]
+        ldp     C_l, C_h, [src, #-48]
+        ldp     D_l, D_h, [src, #-64]!
+1:
+        /*
+        * Interleave the load of the next 64-byte block with the store of
+        * the 64 bytes loaded on the previous iteration.
+        */
+        stp     A_l, A_h, [dst, #-16]
+        ldp     A_l, A_h, [src, #-16]
+        stp     B_l, B_h, [dst, #-32]
+        ldp     B_l, B_h, [src, #-32]
+        stp     C_l, C_h, [dst, #-48]
+        ldp     C_l, C_h, [src, #-48]
+        stp     D_l, D_h, [dst, #-64]!
+        ldp     D_l, D_h, [src, #-64]!
+        subs    count, count, #64
+        b.ge    1b
+        stp     A_l, A_h, [dst, #-16]
+        stp     B_l, B_h, [dst, #-32]
+        stp     C_l, C_h, [dst, #-48]
+        stp     D_l, D_h, [dst, #-64]!
+
+        tst     count, #0x3f
+        b.ne    .Ltail63
+        ret
 ENDPROC(memmove)
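
Structurally, the new backward-copy path does three things: it peels off up to 15 bytes so
the source pointer becomes 16-byte aligned, it then moves 64-byte blocks in a
software-pipelined loop that interleaves the loads of one block with the stores of the
previous one (and is cache-line aligned so the hot loop fits in a single line), and it
finishes the remainder by testing individual bits of the residual count. The C below is
only a rough model of that control flow: the helper names are invented for illustration,
and the real routine moves data through ldp/stp register pairs rather than a temporary
buffer.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Copy one block by reading it completely before writing it, the way the
 * assembly fills registers with ldp before draining them with stp.  This
 * keeps a backward, block-at-a-time copy correct even when source and
 * destination blocks overlap.  len is at most 64 here.
 */
static void copy_block_backward(unsigned char **d, const unsigned char **s,
                                size_t len)
{
        unsigned char tmp[64];

        *d -= len;
        *s -= len;
        memcpy(tmp, *s, len);   /* "ldp": load the whole block first */
        memcpy(*d, tmp, len);   /* "stp": then store it */
}

/* Rough C model of the backward-copy path added by this patch. */
static void *backward_copy_sketch(void *dest, const void *src, size_t count)
{
        unsigned char *d = (unsigned char *)dest + count;      /* one past the end */
        const unsigned char *s = (const unsigned char *)src + count;

        /* 1. Peel off up to 15 bytes so the source pointer becomes 16-byte
         *    aligned (the tbz ladder after the "ands tmp2, src, #15" test). */
        size_t head = (uintptr_t)s & 15;
        if (head > count)
                head = count;
        count -= head;
        if (head)
                copy_block_backward(&d, &s, head);

        /* 2. Move 64-byte blocks; the assembly pipelines these as four
         *    ldp/stp pairs per iteration in a cache-line-aligned loop. */
        while (count >= 64) {
                copy_block_backward(&d, &s, 64);
                count -= 64;
        }

        /* 3. Finish the tail by testing bits of the residual count, as the
         *    tbz-driven .Ltail63/.Ltail15 sequences do. */
        for (size_t bit = 32; bit >= 1; bit /= 2)
                if (count & bit)
                        copy_block_backward(&d, &s, bit);

        return dest;
}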