зеркало из https://github.com/mozilla/moz-skia.git
Add sk_float_rsqrt with SSE + NEON fast paths.
Current numbers: N4: running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 3.12 running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 4.82 running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 1.99 Desktop: running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 0.89 running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 0.94 running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 0.09 Haven't found any other benches where this is a significant effect yet. BUG= R=reed@google.com Author: mtklein@google.com Review URL: https://codereview.chromium.org/60083014 git-svn-id: http://skia.googlecode.com/svn/trunk@12203 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
Родитель
8f457e3230
Коммит
11e5b972a9
|
@ -92,6 +92,22 @@ private:
|
|||
typedef MathBench INHERITED;
|
||||
};
|
||||
|
||||
class SkRSqrtMathBench : public MathBench {
|
||||
public:
|
||||
SkRSqrtMathBench() : INHERITED("sk_float_rsqrt") {}
|
||||
protected:
|
||||
virtual void performTest(float* SK_RESTRICT dst,
|
||||
const float* SK_RESTRICT src,
|
||||
int count) {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
dst[i] = sk_float_rsqrt(src[i]);
|
||||
}
|
||||
}
|
||||
private:
|
||||
typedef MathBench INHERITED;
|
||||
};
|
||||
|
||||
|
||||
class SlowISqrtMathBench : public MathBench {
|
||||
public:
|
||||
SlowISqrtMathBench() : INHERITED("slowIsqrt") {}
|
||||
|
@ -550,6 +566,7 @@ DEF_BENCH(return new DivModBench<int64_t>("int64_t"))
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
DEF_BENCH( return new NoOpMathBench(); )
|
||||
DEF_BENCH( return new SkRSqrtMathBench(); )
|
||||
DEF_BENCH( return new SlowISqrtMathBench(); )
|
||||
DEF_BENCH( return new FastISqrtMathBench(); )
|
||||
DEF_BENCH( return new QMul64Bench(); )
|
||||
|
|
|
@ -96,4 +96,45 @@ extern const uint32_t gIEEENegativeInfinity;
|
|||
#define SK_FloatNaN (*SkTCast<const float*>(&gIEEENotANumber))
|
||||
#define SK_FloatInfinity (*SkTCast<const float*>(&gIEEEInfinity))
|
||||
#define SK_FloatNegativeInfinity (*SkTCast<const float*>(&gIEEENegativeInfinity))
|
||||
|
||||
#if defined(__SSE__)
|
||||
#include <xmmintrin.h>
|
||||
#elif defined(__ARM_NEON__)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
// Fast, approximate inverse square root.
|
||||
// Compare to name-brand "1.0f / sk_float_sqrt(x)". Should be around 10x faster on SSE, 2x on NEON.
|
||||
static inline float sk_float_rsqrt(const float x) {
|
||||
// We want all this inlined, so we'll inline SIMD and just take the hit when we don't know we've got
|
||||
// it at compile time. This is going to be too fast to productively hide behind a function pointer.
|
||||
//
|
||||
// We do one step of Newton's method to refine the estimates in the NEON and null paths. No
|
||||
// refinement is faster, but very innacurate. Two steps is more accurate, but slower than 1/sqrt.
|
||||
#if defined(__SSE__)
|
||||
float result;
|
||||
_mm_store_ss(&result, _mm_rsqrt_ss(_mm_set_ss(x)));
|
||||
return result;
|
||||
#elif defined(__ARM_NEON__)
|
||||
// Get initial estimate.
|
||||
const float32x2_t xx = vdup_n_f32(x); // Clever readers will note we're doing everything 2x.
|
||||
float32x2_t estimate = vrsqrte_f32(xx);
|
||||
|
||||
// One step of Newton's method to refine.
|
||||
const float32x2_t estimate_sq = vmul_f32(estimate, estimate);
|
||||
estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq));
|
||||
return vget_lane_f32(estimate, 0); // 1 will work fine too; the answer's in both places.
|
||||
#else
|
||||
// Get initial estimate.
|
||||
int i = *SkTCast<int*>(&x);
|
||||
i = 0x5f3759df - (i>>1);
|
||||
float estimate = *SkTCast<float*>(&i);
|
||||
|
||||
// One step of Newton's method to refine.
|
||||
const float estimate_sq = estimate*estimate;
|
||||
estimate *= (1.5f-0.5f*x*estimate_sq);
|
||||
return estimate;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -216,13 +216,10 @@ struct SK_API SkPoint {
|
|||
* Return true if the computed length of the vector is >= the internal
|
||||
* tolerance (used to avoid dividing by tiny values).
|
||||
*/
|
||||
static bool CanNormalize(SkScalar dx, SkScalar dy)
|
||||
#ifdef SK_SCALAR_IS_FLOAT
|
||||
// Simple enough (and performance critical sometimes) so we inline it.
|
||||
{ return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero); }
|
||||
#else
|
||||
;
|
||||
#endif
|
||||
static bool CanNormalize(SkScalar dx, SkScalar dy) {
|
||||
// Simple enough (and performance critical sometimes) so we inline it.
|
||||
return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero);
|
||||
}
|
||||
|
||||
bool canNormalize() const {
|
||||
return CanNormalize(fX, fY);
|
||||
|
@ -252,6 +249,14 @@ struct SK_API SkPoint {
|
|||
*/
|
||||
bool setLength(SkScalar x, SkScalar y, SkScalar length);
|
||||
|
||||
/** Same as setLength, but favoring speed over accuracy.
|
||||
*/
|
||||
bool setLengthFast(SkScalar length);
|
||||
|
||||
/** Same as setLength, but favoring speed over accuracy.
|
||||
*/
|
||||
bool setLengthFast(SkScalar x, SkScalar y, SkScalar length);
|
||||
|
||||
/** Scale the point's coordinates by scale, writing the answer into dst.
|
||||
It is legal for dst == this.
|
||||
*/
|
||||
|
@ -316,7 +321,6 @@ struct SK_API SkPoint {
|
|||
* Returns true if both X and Y are finite (not infinity or NaN)
|
||||
*/
|
||||
bool isFinite() const {
|
||||
#ifdef SK_SCALAR_IS_FLOAT
|
||||
SkScalar accum = 0;
|
||||
accum *= fX;
|
||||
accum *= fY;
|
||||
|
@ -327,12 +331,6 @@ struct SK_API SkPoint {
|
|||
// value==value will be true iff value is not NaN
|
||||
// TODO: is it faster to say !accum or accum==accum?
|
||||
return accum == accum;
|
||||
#else
|
||||
// use bit-or for speed, since we don't care about short-circuting the
|
||||
// tests, and we expect the common case will be that we need to check all.
|
||||
int isNaN = (SK_FixedNaN == fX) | (SK_FixedNaN == fX));
|
||||
return !isNaN;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -87,8 +87,6 @@ bool SkPoint::setLength(SkScalar length) {
|
|||
return this->setLength(fX, fY, length);
|
||||
}
|
||||
|
||||
#ifdef SK_SCALAR_IS_FLOAT
|
||||
|
||||
// Returns the square of the Euclidian distance to (dx,dy).
|
||||
static inline float getLengthSquared(float dx, float dy) {
|
||||
return dx * dx + dy * dy;
|
||||
|
@ -177,290 +175,32 @@ bool SkPoint::setLength(float x, float y, float length) {
|
|||
return true;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "Sk64.h"
|
||||
|
||||
// Returns the square of the Euclidian distance to (dx,dy) in *result.
|
||||
static inline void getLengthSquared(SkScalar dx, SkScalar dy, Sk64 *result) {
|
||||
Sk64 dySqr;
|
||||
|
||||
result->setMul(dx, dx);
|
||||
dySqr.setMul(dy, dy);
|
||||
result->add(dySqr);
|
||||
bool SkPoint::setLengthFast(float length) {
|
||||
return this->setLengthFast(fX, fY, length);
|
||||
}
|
||||
|
||||
// Calculates the square of the Euclidian distance to (dx,dy) and stores it in
|
||||
// *lengthSquared. Returns true if the distance is judged to be "nearly zero".
|
||||
//
|
||||
// This logic is encapsulated in a helper method to make it explicit that we
|
||||
// always perform this check in the same manner, to avoid inconsistencies
|
||||
// (see http://code.google.com/p/skia/issues/detail?id=560 ).
|
||||
static inline bool isLengthNearlyZero(SkScalar dx, SkScalar dy,
|
||||
Sk64 *lengthSquared) {
|
||||
Sk64 tolSqr;
|
||||
getLengthSquared(dx, dy, lengthSquared);
|
||||
|
||||
// we want nearlyzero^2, but to compute it fast we want to just do a
|
||||
// 32bit multiply, so we require that it not exceed 31bits. That is true
|
||||
// if nearlyzero is <= 0xB504, which should be trivial, since usually
|
||||
// nearlyzero is a very small fixed-point value.
|
||||
SkASSERT(SK_ScalarNearlyZero <= 0xB504);
|
||||
|
||||
tolSqr.set(0, SK_ScalarNearlyZero * SK_ScalarNearlyZero);
|
||||
return *lengthSquared <= tolSqr;
|
||||
}
|
||||
|
||||
SkScalar SkPoint::Normalize(SkPoint* pt) {
|
||||
Sk64 mag2;
|
||||
if (!isLengthNearlyZero(pt->fX, pt->fY, &mag2)) {
|
||||
SkScalar mag = mag2.getSqrt();
|
||||
SkScalar scale = SkScalarInvert(mag);
|
||||
pt->fX = SkScalarMul(pt->fX, scale);
|
||||
pt->fY = SkScalarMul(pt->fY, scale);
|
||||
return mag;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool SkPoint::CanNormalize(SkScalar dx, SkScalar dy) {
|
||||
Sk64 mag2_unused;
|
||||
return !isLengthNearlyZero(dx, dy, &mag2_unused);
|
||||
}
|
||||
|
||||
SkScalar SkPoint::Length(SkScalar dx, SkScalar dy) {
|
||||
Sk64 tmp;
|
||||
getLengthSquared(dx, dy, &tmp);
|
||||
return tmp.getSqrt();
|
||||
}
|
||||
|
||||
#ifdef SK_DEBUGx
|
||||
static SkFixed fixlen(SkFixed x, SkFixed y) {
|
||||
float fx = (float)x;
|
||||
float fy = (float)y;
|
||||
|
||||
return (int)floorf(sqrtf(fx*fx + fy*fy) + 0.5f);
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline uint32_t squarefixed(unsigned x) {
|
||||
x >>= 16;
|
||||
return x*x;
|
||||
}
|
||||
|
||||
#if 1 // Newton iter for setLength
|
||||
|
||||
static inline unsigned invsqrt_iter(unsigned V, unsigned U) {
|
||||
unsigned x = V * U >> 14;
|
||||
x = x * U >> 14;
|
||||
x = (3 << 14) - x;
|
||||
x = (U >> 1) * x >> 14;
|
||||
return x;
|
||||
}
|
||||
|
||||
static const uint16_t gInvSqrt14GuessTable[] = {
|
||||
0x4000, 0x3c57, 0x393e, 0x3695, 0x3441, 0x3235, 0x3061,
|
||||
0x2ebd, 0x2d41, 0x2be7, 0x2aaa, 0x2987, 0x287a, 0x2780,
|
||||
0x2698, 0x25be, 0x24f3, 0x2434, 0x2380, 0x22d6, 0x2235,
|
||||
0x219d, 0x210c, 0x2083, 0x2000, 0x1f82, 0x1f0b, 0x1e99,
|
||||
0x1e2b, 0x1dc2, 0x1d5d, 0x1cfc, 0x1c9f, 0x1c45, 0x1bee,
|
||||
0x1b9b, 0x1b4a, 0x1afc, 0x1ab0, 0x1a67, 0x1a20, 0x19dc,
|
||||
0x1999, 0x1959, 0x191a, 0x18dd, 0x18a2, 0x1868, 0x1830,
|
||||
0x17fa, 0x17c4, 0x1791, 0x175e, 0x172d, 0x16fd, 0x16ce
|
||||
};
|
||||
|
||||
#define BUILD_INVSQRT_TABLEx
|
||||
#ifdef BUILD_INVSQRT_TABLE
|
||||
static void build_invsqrt14_guess_table() {
|
||||
for (int i = 8; i <= 63; i++) {
|
||||
unsigned x = SkToU16((1 << 28) / SkSqrt32(i << 25));
|
||||
printf("0x%x, ", x);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
static unsigned fast_invsqrt(uint32_t x) {
|
||||
#ifdef BUILD_INVSQRT_TABLE
|
||||
unsigned top2 = x >> 25;
|
||||
SkASSERT(top2 >= 8 && top2 <= 63);
|
||||
|
||||
static bool gOnce;
|
||||
if (!gOnce) {
|
||||
build_invsqrt14_guess_table();
|
||||
gOnce = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
unsigned V = x >> 14; // make V .14
|
||||
|
||||
unsigned top = x >> 25;
|
||||
SkASSERT(top >= 8 && top <= 63);
|
||||
SkASSERT(top - 8 < SK_ARRAY_COUNT(gInvSqrt14GuessTable));
|
||||
unsigned U = gInvSqrt14GuessTable[top - 8];
|
||||
|
||||
U = invsqrt_iter(V, U);
|
||||
return invsqrt_iter(V, U);
|
||||
}
|
||||
|
||||
/* We "normalize" x,y to be .14 values (so we can square them and stay 32bits.
|
||||
Then we Newton-iterate this in .14 space to compute the invser-sqrt, and
|
||||
scale by it at the end. The .14 space means we can execute our iterations
|
||||
and stay in 32bits as well, making the multiplies much cheaper than calling
|
||||
SkFixedMul.
|
||||
*/
|
||||
bool SkPoint::setLength(SkFixed ox, SkFixed oy, SkFixed length) {
|
||||
if (ox == 0) {
|
||||
if (oy == 0) {
|
||||
return false;
|
||||
}
|
||||
this->set(0, SkApplySign(length, SkExtractSign(oy)));
|
||||
return true;
|
||||
}
|
||||
if (oy == 0) {
|
||||
this->set(SkApplySign(length, SkExtractSign(ox)), 0);
|
||||
return true;
|
||||
bool SkPoint::setLengthFast(float x, float y, float length) {
|
||||
float mag2;
|
||||
if (isLengthNearlyZero(x, y, &mag2)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned x = SkAbs32(ox);
|
||||
unsigned y = SkAbs32(oy);
|
||||
int zeros = SkCLZ(x | y);
|
||||
|
||||
// make x,y 1.14 values so our fast sqr won't overflow
|
||||
if (zeros > 17) {
|
||||
x <<= zeros - 17;
|
||||
y <<= zeros - 17;
|
||||
float scale;
|
||||
if (SkScalarIsFinite(mag2)) {
|
||||
scale = length * sk_float_rsqrt(mag2); // <--- this is the difference
|
||||
} else {
|
||||
x >>= 17 - zeros;
|
||||
y >>= 17 - zeros;
|
||||
// our mag2 step overflowed to infinity, so use doubles instead.
|
||||
// much slower, but needed when x or y are very large, other wise we
|
||||
// divide by inf. and return (0,0) vector.
|
||||
double xx = x;
|
||||
double yy = y;
|
||||
scale = (float)(length / sqrt(xx * xx + yy * yy));
|
||||
}
|
||||
SkASSERT((x | y) <= 0x7FFF);
|
||||
|
||||
unsigned invrt = fast_invsqrt(x*x + y*y);
|
||||
|
||||
x = x * invrt >> 12;
|
||||
y = y * invrt >> 12;
|
||||
|
||||
if (length != SK_Fixed1) {
|
||||
x = SkFixedMul(x, length);
|
||||
y = SkFixedMul(y, length);
|
||||
}
|
||||
this->set(SkApplySign(x, SkExtractSign(ox)),
|
||||
SkApplySign(y, SkExtractSign(oy)));
|
||||
fX = x * scale;
|
||||
fY = y * scale;
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
/*
|
||||
Normalize x,y, and then scale them by length.
|
||||
|
||||
The obvious way to do this would be the following:
|
||||
S64 tmp1, tmp2;
|
||||
tmp1.setMul(x,x);
|
||||
tmp2.setMul(y,y);
|
||||
tmp1.add(tmp2);
|
||||
len = tmp1.getSqrt();
|
||||
x' = SkFixedDiv(x, len);
|
||||
y' = SkFixedDiv(y, len);
|
||||
This is fine, but slower than what we do below.
|
||||
|
||||
The present technique does not compute the starting length, but
|
||||
rather fiddles with x,y iteratively, all the while checking its
|
||||
magnitude^2 (avoiding a sqrt).
|
||||
|
||||
We normalize by first shifting x,y so that at least one of them
|
||||
has bit 31 set (after taking the abs of them).
|
||||
Then we loop, refining x,y by squaring them and comparing
|
||||
against a very large 1.0 (1 << 28), and then adding or subtracting
|
||||
a delta (which itself is reduced by half each time through the loop).
|
||||
For speed we want the squaring to be with a simple integer mul. To keep
|
||||
that from overflowing we shift our coordinates down until we are dealing
|
||||
with at most 15 bits (2^15-1)^2 * 2 says withing 32 bits)
|
||||
When our square is close to 1.0, we shift x,y down into fixed range.
|
||||
*/
|
||||
bool SkPoint::setLength(SkFixed ox, SkFixed oy, SkFixed length) {
|
||||
if (ox == 0) {
|
||||
if (oy == 0)
|
||||
return false;
|
||||
this->set(0, SkApplySign(length, SkExtractSign(oy)));
|
||||
return true;
|
||||
}
|
||||
if (oy == 0) {
|
||||
this->set(SkApplySign(length, SkExtractSign(ox)), 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
SkFixed x = SkAbs32(ox);
|
||||
SkFixed y = SkAbs32(oy);
|
||||
|
||||
// shift x,y so that the greater of them is 15bits (1.14 fixed point)
|
||||
{
|
||||
int shift = SkCLZ(x | y);
|
||||
// make them .30
|
||||
x <<= shift - 1;
|
||||
y <<= shift - 1;
|
||||
}
|
||||
|
||||
SkFixed dx = x;
|
||||
SkFixed dy = y;
|
||||
|
||||
for (int i = 0; i < 17; i++) {
|
||||
dx >>= 1;
|
||||
dy >>= 1;
|
||||
|
||||
U32 len2 = squarefixed(x) + squarefixed(y);
|
||||
if (len2 >> 28) {
|
||||
x -= dx;
|
||||
y -= dy;
|
||||
} else {
|
||||
x += dx;
|
||||
y += dy;
|
||||
}
|
||||
}
|
||||
x >>= 14;
|
||||
y >>= 14;
|
||||
|
||||
#ifdef SK_DEBUGx // measure how far we are from unit-length
|
||||
{
|
||||
static int gMaxError;
|
||||
static int gMaxDiff;
|
||||
|
||||
SkFixed len = fixlen(x, y);
|
||||
int err = len - SK_Fixed1;
|
||||
err = SkAbs32(err);
|
||||
|
||||
if (err > gMaxError) {
|
||||
gMaxError = err;
|
||||
SkDebugf("gMaxError %d\n", err);
|
||||
}
|
||||
|
||||
float fx = SkAbs32(ox)/65536.0f;
|
||||
float fy = SkAbs32(oy)/65536.0f;
|
||||
float mag = sqrtf(fx*fx + fy*fy);
|
||||
fx /= mag;
|
||||
fy /= mag;
|
||||
SkFixed xx = (int)floorf(fx * 65536 + 0.5f);
|
||||
SkFixed yy = (int)floorf(fy * 65536 + 0.5f);
|
||||
err = SkMax32(SkAbs32(xx-x), SkAbs32(yy-y));
|
||||
if (err > gMaxDiff) {
|
||||
gMaxDiff = err;
|
||||
SkDebugf("gMaxDiff %d\n", err);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
x = SkApplySign(x, SkExtractSign(ox));
|
||||
y = SkApplySign(y, SkExtractSign(oy));
|
||||
if (length != SK_Fixed1) {
|
||||
x = SkFixedMul(x, length);
|
||||
y = SkFixedMul(y, length);
|
||||
}
|
||||
|
||||
this->set(x, y);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
|
|
@ -117,7 +117,8 @@ static void test_underflow(skiatest::Reporter* reporter) {
|
|||
REPORTER_ASSERT(reporter, pt == copy); // pt is unchanged
|
||||
}
|
||||
|
||||
static void PointTest(skiatest::Reporter* reporter) {
|
||||
#include "TestClassDef.h"
|
||||
DEF_TEST(Point, reporter) {
|
||||
test_casts(reporter);
|
||||
|
||||
static const struct {
|
||||
|
@ -137,5 +138,22 @@ static void PointTest(skiatest::Reporter* reporter) {
|
|||
test_overflow(reporter);
|
||||
}
|
||||
|
||||
#include "TestClassDef.h"
|
||||
DEFINE_TESTCLASS("Point", PointTestClass, PointTest)
|
||||
DEF_TEST(Point_setLengthFast, reporter) {
|
||||
// Scale a (1,1) point to a bunch of different lengths,
|
||||
// making sure the slow and fast paths are within 0.1%.
|
||||
const float tests[] = { 1.0f, 0.0f, 1.0e-37f, 3.4e38f, 42.0f, 0.00012f };
|
||||
|
||||
const SkPoint kOne = {1.0f, 1.0f};
|
||||
for (unsigned i = 0; i < SK_ARRAY_COUNT(tests); i++) {
|
||||
SkPoint slow = kOne, fast = kOne;
|
||||
|
||||
slow.setLength(tests[i]);
|
||||
fast.setLengthFast(tests[i]);
|
||||
|
||||
if (slow.length() < FLT_MIN && fast.length() < FLT_MIN) continue;
|
||||
|
||||
SkScalar ratio = slow.length() / fast.length();
|
||||
REPORTER_ASSERT(reporter, ratio > 0.999f);
|
||||
REPORTER_ASSERT(reporter, ratio < 1.001f);
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче