OMAP3: PM: Update clean_l2 to use v7_flush_dcache_all
Analysis in the TI kernel with ETM showed that using the cache-mapped flush in the kernel instead of the SO-mapped flush reduces the cost of clean_l2, which is used during sleep sequences, by 65% (3.39 ms down to 1.17 ms). Overall: - speed up - unfortunately there isn't a good alternative flush method today - code reduction, less maintenance, and fewer potential bugs in unmaintained code. This also fixes the bug with the clean_l2 function usage. Reported-by: Tony Lindgren <tony@atomide.com> Cc: Kevin Hilman <khilman@deeprootsystems.com> Cc: Tony Lindgren <tony@atomide.com> Acked-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Acked-by: Jean Pihet <j-pihet@ti.com> [nm@ti.com: ported rkw's proposal to 2.6.37-rc2] Signed-off-by: Nishanth Menon <nm@ti.com> Signed-off-by: Richard Woodruff <r-woodruff2@ti.com> Signed-off-by: Kevin Hilman <khilman@deeprootsystems.com>
This commit is contained in:
Родитель
1cbbe37ac5
Коммит
0bd4053536
|
@ -520,72 +520,18 @@ clean_caches:
|
|||
cmp r9, #1 /* Check whether L2 inval is required or not*/
|
||||
bne skip_l2_inval
|
||||
clean_l2:
|
||||
/* read clidr */
|
||||
mrc p15, 1, r0, c0, c0, 1
|
||||
/* extract loc from clidr */
|
||||
ands r3, r0, #0x7000000
|
||||
/* shift loc down to bits [3:1] (r3 = 2 * loc), same scale as the level counter in r10 */
|
||||
mov r3, r3, lsr #23
|
||||
/* if loc is 0, then no need to clean */
|
||||
beq finished
|
||||
/* start clean at cache level 0 */
|
||||
mov r10, #0
|
||||
loop1:
|
||||
/* work out 3x current cache level */
|
||||
add r2, r10, r10, lsr #1
|
||||
/* extract cache type bits from clidr*/
|
||||
mov r1, r0, lsr r2
|
||||
/* keep only the bits for the current cache level */
|
||||
and r1, r1, #7
|
||||
/* see what cache we have at this level */
|
||||
cmp r1, #2
|
||||
/* skip if no cache, or just i-cache */
|
||||
blt skip
|
||||
/* select current cache level in cssr */
|
||||
mcr p15, 2, r10, c0, c0, 0
|
||||
/* isb to sync the new cssr & csidr */
|
||||
isb
|
||||
/* read the new csidr */
|
||||
mrc p15, 1, r1, c0, c0, 0
|
||||
/* extract the length of the cache lines */
|
||||
and r2, r1, #7
|
||||
/* add 4 (line length offset) */
|
||||
add r2, r2, #4
|
||||
ldr r4, assoc_mask
|
||||
/* find maximum number on the way size */
|
||||
ands r4, r4, r1, lsr #3
|
||||
/* find bit position of way size increment */
|
||||
clz r5, r4
|
||||
ldr r7, numset_mask
|
||||
/* extract max number of the index size*/
|
||||
ands r7, r7, r1, lsr #13
|
||||
loop2:
|
||||
mov r9, r4
|
||||
/* create working copy of max way size*/
|
||||
loop3:
|
||||
/* factor way and cache number into r11 */
|
||||
orr r11, r10, r9, lsl r5
|
||||
/* factor index number into r11 */
|
||||
orr r11, r11, r7, lsl r2
|
||||
/* clean by set/way (c7, c10, 2 cleans only; it does not invalidate) */
|
||||
mcr p15, 0, r11, c7, c10, 2
|
||||
/* decrement the way*/
|
||||
subs r9, r9, #1
|
||||
bge loop3
|
||||
/*decrement the index */
|
||||
subs r7, r7, #1
|
||||
bge loop2
|
||||
skip:
|
||||
add r10, r10, #2
|
||||
/* increment cache number */
|
||||
cmp r3, r10
|
||||
bgt loop1
|
||||
finished:
|
||||
/* switch back to cache level 0 */
|
||||
mov r10, #0
|
||||
/* select current cache level in cssr */
|
||||
mcr p15, 2, r10, c0, c0, 0
|
||||
isb
|
||||
/*
|
||||
* Jump out to kernel flush routine
|
||||
* - reusing that code is better
|
||||
* - it executes in a cached space so is faster than refetch per-block
|
||||
* - should be faster and will change with kernel
|
||||
* - 'might' have to copy address, load and jump to it
|
||||
* - lr is used since we are running in SRAM currently.
|
||||
*/
|
||||
ldr r1, kernel_flush
|
||||
mov lr, pc
|
||||
bx r1
|
||||
|
||||
skip_l2_inval:
|
||||
/* Data memory barrier and Data sync barrier */
|
||||
mov r1, #0
|
||||
|
@ -668,5 +614,7 @@ cache_pred_disable_mask:
|
|||
.word 0xFFFFE7FB
|
||||
control_stat:
|
||||
.word CONTROL_STAT
|
||||
kernel_flush:
|
||||
.word v7_flush_dcache_all
|
||||
ENTRY(omap34xx_cpu_suspend_sz)
|
||||
.word . - omap34xx_cpu_suspend
|
||||
|
|
Загрузка…
Ссылка в новой задаче