arm64: lib: improve copy_page to deal with 128 bytes at a time
We want to avoid lots of different copy_page implementations, settling for something that is "good enough" everywhere and hopefully easy to understand and maintain whilst we're at it. This patch reworks our copy_page implementation based on discussions with Cavium on the list and benchmarking on Cortex-A processors so that: - The loop is unrolled to copy 128 bytes per iteration - The reads are offset so that we read from the next 128-byte block in the same iteration that we store the previous block - Explicit prefetch instructions are removed for now, since they hurt performance on CPUs with hardware prefetching - The loop exit condition is calculated at the start of the loop Signed-off-by: Will Deacon <will.deacon@arm.com> Tested-by: Andrew Pinski <apinski@cavium.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
This commit is contained in:
Родитель
d5370f7548
Коммит
223e23e8aa
|
@ -27,20 +27,50 @@
|
|||
* x1 - src
|
||||
*/
|
||||
ENTRY(copy_page)
|
||||
/* Assume cache line size is 64 bytes. */
|
||||
prfm pldl1strm, [x1, #64]
|
||||
1: ldp x2, x3, [x1]
|
||||
ldp x2, x3, [x1]
|
||||
ldp x4, x5, [x1, #16]
|
||||
ldp x6, x7, [x1, #32]
|
||||
ldp x8, x9, [x1, #48]
|
||||
add x1, x1, #64
|
||||
prfm pldl1strm, [x1, #64]
|
||||
ldp x10, x11, [x1, #64]
|
||||
ldp x12, x13, [x1, #80]
|
||||
ldp x14, x15, [x1, #96]
|
||||
ldp x16, x17, [x1, #112]
|
||||
|
||||
mov x18, #(PAGE_SIZE - 128)
|
||||
add x1, x1, #128
|
||||
1:
|
||||
subs x18, x18, #128
|
||||
|
||||
stnp x2, x3, [x0]
|
||||
ldp x2, x3, [x1]
|
||||
stnp x4, x5, [x0, #16]
|
||||
ldp x4, x5, [x1, #16]
|
||||
stnp x6, x7, [x0, #32]
|
||||
ldp x6, x7, [x1, #32]
|
||||
stnp x8, x9, [x0, #48]
|
||||
ldp x8, x9, [x1, #48]
|
||||
stnp x10, x11, [x0, #64]
|
||||
ldp x10, x11, [x1, #64]
|
||||
stnp x12, x13, [x0, #80]
|
||||
ldp x12, x13, [x1, #80]
|
||||
stnp x14, x15, [x0, #96]
|
||||
ldp x14, x15, [x1, #96]
|
||||
stnp x16, x17, [x0, #112]
|
||||
ldp x16, x17, [x1, #112]
|
||||
|
||||
add x0, x0, #128
|
||||
add x1, x1, #128
|
||||
|
||||
b.gt 1b
|
||||
|
||||
stnp x2, x3, [x0]
|
||||
stnp x4, x5, [x0, #16]
|
||||
stnp x6, x7, [x0, #32]
|
||||
stnp x8, x9, [x0, #48]
|
||||
add x0, x0, #64
|
||||
tst x1, #(PAGE_SIZE - 1)
|
||||
b.ne 1b
|
||||
stnp x10, x11, [x0, #64]
|
||||
stnp x12, x13, [x0, #80]
|
||||
stnp x14, x15, [x0, #96]
|
||||
stnp x16, x17, [x0, #112]
|
||||
|
||||
ret
|
||||
ENDPROC(copy_page)
|
||||
|
|
Загрузка…
Ссылка в новой задаче