зеркало из https://github.com/mozilla/moz-skia.git
ARM Skia NEON patches - 18 - Preparation work for BitmapProcState
BitmapProcState: clean a little and get rid of some asm. Replacing the apparently stupid dx+dx+dx leads to more instructions being generated. Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BitmapProcState: move code common to C and NEON to a separate header. Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BUG= R=djsollen@google.com Author: kevin.petit.arm@gmail.com Review URL: https://chromiumcodereview.appspot.com/21931002 git-svn-id: http://skia.googlecode.com/svn/trunk@11109 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
Родитель
fd47a121dc
Коммит
a8c09668f9
|
@ -9,13 +9,7 @@
|
|||
#include "SkShader.h"
|
||||
#include "SkUtils.h"
|
||||
#include "SkUtilsArm.h"
|
||||
|
||||
// Shift right by 16 without sign-extension: because the operand is unsigned,
// the vacated high bits are zero-filled, so callers never need to mask the
// result themselves.
static unsigned SK_USHIFT16(unsigned x) {
    const unsigned shifted = x >> 16;
    return shifted;
}
|
||||
#include "SkBitmapProcState_utils.h"
|
||||
|
||||
/* returns 0...(n-1) given any x (positive or negative).
|
||||
|
||||
|
@ -36,35 +30,6 @@ static inline int sk_int_mod(int x, int n) {
|
|||
return x;
|
||||
}
|
||||
|
||||
/*
|
||||
* The decal_ functions require that
|
||||
* 1. dx > 0
|
||||
* 2. [fx, fx+dx, fx+2dx, fx+3dx, ... fx+(count-1)dx] are all <= maxX
|
||||
*
|
||||
* In addition, we use SkFractionalInt to keep more fractional precision than
|
||||
* just SkFixed, so we will abort the decal_ call if dx is very small, since
|
||||
* the decal_ function just operates on SkFixed. If that were changed, we could
|
||||
* skip the very_small test here.
|
||||
*/
|
||||
static inline bool can_truncate_to_fixed_for_decal(SkFractionalInt frX,
|
||||
SkFractionalInt frDx,
|
||||
int count, unsigned max) {
|
||||
SkFixed dx = SkFractionalIntToFixed(frDx);
|
||||
|
||||
// if decal_ kept SkFractionalInt precision, this would just be dx <= 0
|
||||
// I just made up the 1/256. Just don't want to perceive accumulated error
|
||||
// if we truncate frDx and lose its low bits.
|
||||
if (dx <= SK_Fixed1 / 256) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We cast to unsigned so we don't have to check for negative values, which
|
||||
// will now appear as very large positive values, and thus fail our test!
|
||||
SkFixed fx = SkFractionalIntToFixed(frX);
|
||||
return (unsigned)SkFixedFloorToInt(fx) <= max &&
|
||||
(unsigned)SkFixedFloorToInt(fx + dx * (count - 1)) < max;
|
||||
}
|
||||
|
||||
void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
|
||||
void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
|
||||
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
#ifndef SkBitmapProcState_utils_DEFINED
|
||||
#define SkBitmapProcState_utils_DEFINED
|
||||
|
||||
// Helper to ensure that when we shift down, we do it w/o sign-extension
// so the caller doesn't have to manually mask off the top 16 bits.
//
// Declared 'static inline' because this lives in a header: every including
// file gets its own copy, and files that never call it should not trigger an
// unused-function warning.
static inline unsigned SK_USHIFT16(unsigned x) {
    return x >> 16;
}
|
||||
|
||||
/*
|
||||
* The decal_ functions require that
|
||||
* 1. dx > 0
|
||||
* 2. [fx, fx+dx, fx+2dx, fx+3dx, ... fx+(count-1)dx] are all <= maxX
|
||||
*
|
||||
* In addition, we use SkFractionalInt to keep more fractional precision than
|
||||
* just SkFixed, so we will abort the decal_ call if dx is very small, since
|
||||
* the decal_ function just operates on SkFixed. If that were changed, we could
|
||||
* skip the very_small test here.
|
||||
*/
|
||||
static inline bool can_truncate_to_fixed_for_decal(SkFractionalInt frX,
|
||||
SkFractionalInt frDx,
|
||||
int count, unsigned max) {
|
||||
SkFixed dx = SkFractionalIntToFixed(frDx);
|
||||
|
||||
// if decal_ kept SkFractionalInt precision, this would just be dx <= 0
|
||||
// I just made up the 1/256. Just don't want to perceive accumulated error
|
||||
// if we truncate frDx and lose its low bits.
|
||||
if (dx <= SK_Fixed1 / 256) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We cast to unsigned so we don't have to check for negative values, which
|
||||
// will now appear as very large positive values, and thus fail our test!
|
||||
SkFixed fx = SkFractionalIntToFixed(frX);
|
||||
return (unsigned)SkFixedFloorToInt(fx) <= max &&
|
||||
(unsigned)SkFixedFloorToInt(fx + dx * (count - 1)) < max;
|
||||
}
|
||||
|
||||
#endif /* #ifndef SkBitmapProcState_utils_DEFINED */
|
|
@ -8,6 +8,7 @@
|
|||
#include "SkPerspIter.h"
|
||||
#include "SkShader.h"
|
||||
#include "SkUtilsArm.h"
|
||||
#include "SkBitmapProcState_utils.h"
|
||||
|
||||
extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
|
||||
extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
|
||||
|
@ -15,10 +16,6 @@ extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
|
|||
static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
|
||||
static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
|
||||
|
||||
// Unsigned right shift by 16: zero-fills from the top, so the result never
// carries sign-extension bits and needs no masking by the caller.
static unsigned SK_USHIFT16(unsigned x) {
    enum { kShiftBits = 16 };
    return x >> kShiftBits;
}
|
||||
|
||||
#define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon
|
||||
#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
|
||||
#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
|
||||
|
@ -35,93 +32,72 @@ static unsigned SK_USHIFT16(unsigned x) {
|
|||
#include "SkBitmapProcState_matrix_repeat_neon.h"
|
||||
|
||||
|
||||
void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
|
||||
{
|
||||
int i;
|
||||
|
||||
void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
|
||||
if (count >= 8) {
|
||||
/* SkFixed is 16.16 fixed point */
|
||||
SkFixed dx2 = dx+dx;
|
||||
SkFixed dx4 = dx2+dx2;
|
||||
SkFixed dx8 = dx4+dx4;
|
||||
// SkFixed is 16.16 fixed point
|
||||
SkFixed dx8 = dx * 8;
|
||||
int32x4_t vdx8 = vdupq_n_s32(dx8);
|
||||
|
||||
/* now build fx/fx+dx/fx+2dx/fx+3dx */
|
||||
SkFixed fx1, fx2, fx3;
|
||||
// setup lbase and hbase
|
||||
int32x4_t lbase, hbase;
|
||||
uint16_t *dst16 = (uint16_t *)dst;
|
||||
|
||||
fx1 = fx+dx;
|
||||
fx2 = fx1+dx;
|
||||
fx3 = fx2+dx;
|
||||
|
||||
/* avoid an 'lbase unitialized' warning */
|
||||
lbase = vdupq_n_s32(fx);
|
||||
lbase = vsetq_lane_s32(fx1, lbase, 1);
|
||||
lbase = vsetq_lane_s32(fx2, lbase, 2);
|
||||
lbase = vsetq_lane_s32(fx3, lbase, 3);
|
||||
hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
|
||||
lbase = vsetq_lane_s32(fx + dx, lbase, 1);
|
||||
lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2);
|
||||
lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3);
|
||||
hbase = lbase + vdupq_n_s32(4 * dx);
|
||||
|
||||
/* take upper 16 of each, store, and bump everything */
|
||||
do {
|
||||
int32x4_t lout, hout;
|
||||
uint16x8_t hi16;
|
||||
// store the upper 16 bits
|
||||
vst1q_u32(dst, vreinterpretq_u32_s16(
|
||||
vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1]
|
||||
));
|
||||
|
||||
lout = lbase;
|
||||
hout = hbase;
|
||||
/* gets hi's of all louts then hi's of all houts */
|
||||
asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
|
||||
hi16 = vreinterpretq_u16_s32(hout);
|
||||
vst1q_u16(dst16, hi16);
|
||||
|
||||
/* on to the next */
|
||||
lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
|
||||
hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
|
||||
dst16 += 8;
|
||||
// on to the next group of 8
|
||||
lbase += vdx8;
|
||||
hbase += vdx8;
|
||||
dst += 4; // we did 8 elements but the result is twice smaller
|
||||
count -= 8;
|
||||
fx += dx8;
|
||||
} while (count >= 8);
|
||||
dst = (uint32_t *) dst16;
|
||||
}
|
||||
|
||||
uint16_t* xx = (uint16_t*)dst;
|
||||
for (i = count; i > 0; --i) {
|
||||
for (int i = count; i > 0; --i) {
|
||||
*xx++ = SkToU16(fx >> 16); fx += dx;
|
||||
}
|
||||
}
|
||||
|
||||
void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
|
||||
{
|
||||
void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
|
||||
if (count >= 8) {
|
||||
int32x4_t wide_fx;
|
||||
int32x4_t wide_fx2;
|
||||
int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
|
||||
SkFixed dx8 = dx * 8;
|
||||
int32x4_t vdx8 = vdupq_n_s32(dx8);
|
||||
|
||||
int32x4_t wide_fx, wide_fx2;
|
||||
wide_fx = vdupq_n_s32(fx);
|
||||
wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
|
||||
wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
|
||||
wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
|
||||
wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1);
|
||||
wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2);
|
||||
wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3);
|
||||
|
||||
wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
|
||||
wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx));
|
||||
|
||||
while (count >= 8) {
|
||||
int32x4_t wide_out;
|
||||
int32x4_t wide_out2;
|
||||
|
||||
wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
|
||||
wide_out = vorrq_s32(wide_out,
|
||||
vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
|
||||
wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1));
|
||||
|
||||
wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
|
||||
wide_out2 = vorrq_s32(wide_out2,
|
||||
vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
|
||||
wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1));
|
||||
|
||||
vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
|
||||
vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
|
||||
|
||||
dst += 8;
|
||||
fx += dx*8;
|
||||
wide_fx = vaddq_s32(wide_fx, wide_dx8);
|
||||
wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
|
||||
fx += dx8;
|
||||
wide_fx += vdx8;
|
||||
wide_fx2 += vdx8;
|
||||
count -= 8;
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче