Partially revert "powerpc: Remove duplicate cacheable_memcpy/memzero functions"
This partially reverts commit b05ae4ee602b7dc90771408ccf0972e1b3801a35 ("powerpc: Remove duplicate cacheable_memcpy/memzero functions"). The cacheable_memcpy/cacheable_memzero functions are more efficient than memcpy/memset because they use the dcbz instruction, which avoids refilling the cache line with data that is about to be overwritten. Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> Signed-off-by: Scott Wood <scottwood@freescale.com>
This commit is contained in:
Родитель
934628c7e6
Коммит
df087e450d
|
@ -69,6 +69,54 @@ CACHELINE_BYTES = L1_CACHE_BYTES
|
|||
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
|
||||
CACHELINE_MASK = (L1_CACHE_BYTES-1)
|
||||
|
||||
/*
|
||||
* Use dcbz on the complete cache lines in the destination
|
||||
* to set them to zero. This requires that the destination
|
||||
* area is cacheable. -- paulus
|
||||
*/
|
||||
/*
 * cacheable_memzero: zero a memory area, using dcbz for every complete
 * cache line and ordinary word/byte stores for the partial head and tail.
 *
 * Register use visible in the code below:
 *   r3  start address of the area on entry
 *   r4  moved to r5 and used as the byte count, then reloaded with the
 *       zero value that is stored
 *   r6  running store pointer, kept 4 bytes behind the next word to
 *       write so that the update-form stwu can be used
 * NOTE(review): presumably called from C as (void *dst, size) -- confirm
 * against the prototype.
 */
_GLOBAL(cacheable_memzero)
|
||||
mr r5,r4 /* r5 = byte count */
|
||||
li r4,0 /* r4 = the zero that gets stored */
|
||||
addi r6,r3,-4 /* bias the pointer by -4 for the 4(r6) update-form stores */
|
||||
cmplwi 0,r5,4 /* fewer than 4 bytes? */
|
||||
blt 7f /* yes: only the byte loop at 7: is needed */
|
||||
stwu r4,4(r6) /* zero the first word; r6 now equals the start address */
|
||||
beqlr /* cr0 still holds the compare above: exactly 4 bytes, done */
|
||||
andi. r0,r6,3 /* r0 = misalignment of the start within a word */
|
||||
add r5,r0,r5 /* count is now measured from the rounded-down address */
|
||||
subf r6,r0,r6 /* r6 = start rounded down to a word boundary */
|
||||
clrlwi r7,r6,32-LG_CACHELINE_BYTES /* r7 = offset of r6 within its cache line */
|
||||
add r8,r7,r5 /* r8 = bytes from the start of that cache line to the end of the area */
|
||||
srwi r9,r8,LG_CACHELINE_BYTES /* r9 = r8 / CACHELINE_BYTES */
|
||||
addic. r9,r9,-1 /* total number of complete cachelines */
|
||||
ble 2f /* none: finish with plain word and byte stores */
|
||||
xori r0,r7,CACHELINE_MASK & ~3 /* byte distance from r6 to the last word of its cache line */
|
||||
srwi. r0,r0,2 /* ... converted to a number of words */
|
||||
beq 3f /* the next word to write already starts a cache line */
|
||||
mtctr r0
|
||||
4: stwu r4,4(r6) /* zero words up to the cache-line boundary */
|
||||
bdnz 4b
|
||||
3: mtctr r9 /* one iteration per complete cache line */
|
||||
li r7,4 /* dcbz index: r6 lags the next address to write by 4 */
|
||||
10: dcbz r7,r6 /* zero the whole cache line at r6+4 */
|
||||
addi r6,r6,CACHELINE_BYTES /* advance to the next cache line */
|
||||
bdnz 10b
|
||||
clrlwi r5,r8,32-LG_CACHELINE_BYTES /* r5 = bytes left in the final partial cache line */
|
||||
addi r5,r5,4 /* compensate for the one word that the bdz below discounts */
|
||||
2: srwi r0,r5,2 /* word count, including one word that is already accounted for */
|
||||
mtctr r0
|
||||
bdz 6f /* decrement first; skip the word loop if nothing remains */
|
||||
1: stwu r4,4(r6) /* zero the remaining whole words */
|
||||
bdnz 1b
|
||||
6: andi. r5,r5,3 /* r5 = trailing bytes (0-3) */
|
||||
7: cmpwi 0,r5,0 /* also the entry point for areas shorter than 4 bytes */
|
||||
beqlr /* nothing left to zero */
|
||||
mtctr r5
|
||||
addi r6,r6,3 /* re-bias so that stbu's +1 update lands on the next byte */
|
||||
8: stbu r4,1(r6) /* zero one byte per iteration */
|
||||
bdnz 8b
|
||||
blr
|
||||
|
||||
_GLOBAL(memset)
|
||||
rlwimi r4,r4,8,16,23
|
||||
rlwimi r4,r4,16,0,15
|
||||
|
@ -94,6 +142,85 @@ _GLOBAL(memset)
|
|||
bdnz 8b
|
||||
blr
|
||||
|
||||
/*
|
||||
* This version uses dcbz on the complete cache lines in the
|
||||
* destination area to reduce memory traffic. This requires that
|
||||
* the destination area is cacheable.
|
||||
* We only use this version if the source and dest don't overlap.
|
||||
* -- paulus.
|
||||
*/
|
||||
/*
 * cacheable_memcpy: copy r5 bytes from the address in r4 to the address
 * in r3, issuing dcbz on each complete destination cache line before it
 * is filled. Overlapping regions are handed off to memcpy instead.
 *
 * Register use visible in the code below:
 *   r3  destination start on entry
 *   r4  source pointer, biased by -4 for the update-form loads
 *   r5  bytes still to copy
 *   r6  destination pointer, biased by -4 for the update-form stores
 * NOTE(review): COPY_16_BYTES is defined outside this view; its register
 * usage should be checked there.
 */
_GLOBAL(cacheable_memcpy)
|
||||
add r7,r3,r5 /* test if the src & dst overlap */
|
||||
add r8,r4,r5 /* r8 = end of the source region (r7 above = end of dst) */
|
||||
cmplw 0,r4,r7 /* cr0.lt = src start < dst end */
|
||||
cmplw 1,r3,r8 /* cr1.lt = dst start < src end */
|
||||
crand 0,0,4 /* cr0.lt &= cr1.lt */
|
||||
blt memcpy /* if regions overlap */
|
||||
|
||||
addi r4,r4,-4 /* bias src by -4 for the 4(r4) accesses */
|
||||
addi r6,r3,-4 /* r6 = dst - 4, same bias for the stores */
|
||||
neg r0,r3 /* -dst: its low bits give the distance up to the next boundary */
|
||||
andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
|
||||
beq 58f /* dst is already cache-line aligned */
|
||||
|
||||
cmplw 0,r5,r0 /* is this more than total to do? */
|
||||
blt 63f /* if not much to do */
|
||||
andi. r8,r0,3 /* get it word-aligned first */
|
||||
subf r5,r0,r5 /* r5 = bytes left once dst reaches the cache-line boundary */
|
||||
mtctr r8 /* r8 = leading bytes (0-3) */
|
||||
beq+ 61f /* no leading bytes: go straight to the word loop */
|
||||
70: lbz r9,4(r4) /* do some bytes */
|
||||
stb r9,4(r6)
|
||||
addi r4,r4,1 /* lbz/stb do not update, so advance both pointers by hand */
|
||||
addi r6,r6,1
|
||||
bdnz 70b
|
||||
61: srwi. r0,r0,2 /* words needed to reach the cache-line boundary */
|
||||
mtctr r0
|
||||
beq 58f /* none */
|
||||
72: lwzu r9,4(r4) /* do some words */
|
||||
stwu r9,4(r6)
|
||||
bdnz 72b
|
||||
|
||||
58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
|
||||
clrlwi r5,r5,32-LG_CACHELINE_BYTES /* r5 = bytes left over after the complete lines */
|
||||
li r11,4 /* dcbz index: r6 is biased by -4 */
|
||||
mtctr r0
|
||||
beq 63f /* no complete cache line: go to the tail copy */
|
||||
53:
|
||||
dcbz r11,r6 /* zero the dst cache line at r6+4 before it is overwritten */
|
||||
COPY_16_BYTES /* macro defined outside this view; presumably copies 16 bytes and advances r4/r6 -- confirm */
|
||||
#if L1_CACHE_BYTES >= 32
|
||||
COPY_16_BYTES
|
||||
#if L1_CACHE_BYTES >= 64
|
||||
COPY_16_BYTES
|
||||
COPY_16_BYTES
|
||||
#if L1_CACHE_BYTES >= 128
|
||||
COPY_16_BYTES
|
||||
COPY_16_BYTES
|
||||
COPY_16_BYTES
|
||||
COPY_16_BYTES
|
||||
#endif /* L1_CACHE_BYTES >= 128 */
|
||||
#endif /* L1_CACHE_BYTES >= 64 */
|
||||
#endif /* L1_CACHE_BYTES >= 32 */
|
||||
bdnz 53b /* next complete cache line */
|
||||
|
||||
63: srwi. r0,r5,2 /* whole words remaining */
|
||||
mtctr r0
|
||||
beq 64f /* none */
|
||||
30: lwzu r0,4(r4) /* copy the remaining words */
|
||||
stwu r0,4(r6)
|
||||
bdnz 30b
|
||||
|
||||
64: andi. r0,r5,3 /* trailing bytes (0-3) */
|
||||
mtctr r0
|
||||
beq+ 65f /* none: done */
|
||||
40: lbz r0,4(r4) /* copy the trailing bytes one at a time */
|
||||
stb r0,4(r6)
|
||||
addi r4,r4,1
|
||||
addi r6,r6,1
|
||||
bdnz 40b
|
||||
65: blr
|
||||
|
||||
_GLOBAL(memmove)
|
||||
cmplw 0,r3,r4
|
||||
bgt backwards_memcpy
|
||||
|
|
Загрузка…
Ссылка в новой задаче