Add sk_float_rsqrt with SSE + NEON fast paths.

Current numbers:

N4:
running bench [640 480]      math_fastIsqrt    NONRENDERING: cmsecs =      3.12
running bench [640 480]      math_slowIsqrt    NONRENDERING: cmsecs =      4.82
running bench [640 480] math_sk_float_rsqrt    NONRENDERING: cmsecs =      1.99

Desktop:
running bench [640 480]      math_fastIsqrt    NONRENDERING: cmsecs =      0.89
running bench [640 480]      math_slowIsqrt    NONRENDERING: cmsecs =      0.94
running bench [640 480] math_sk_float_rsqrt    NONRENDERING: cmsecs =      0.09

Haven't found any other benches where this is a significant effect yet.

BUG=
R=reed@google.com

Author: mtklein@google.com

Review URL: https://codereview.chromium.org/60083014

git-svn-id: http://skia.googlecode.com/svn/trunk@12203 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
commit-bot@chromium.org 2013-11-08 20:14:16 +00:00
Родитель 8f457e3230
Коммит 11e5b972a9
5 изменённых файлов: 108 добавлений и 294 удалений

Просмотреть файл

@ -92,6 +92,22 @@ private:
typedef MathBench INHERITED;
};
class SkRSqrtMathBench : public MathBench {
public:
SkRSqrtMathBench() : INHERITED("sk_float_rsqrt") {}
protected:
virtual void performTest(float* SK_RESTRICT dst,
const float* SK_RESTRICT src,
int count) {
for (int i = 0; i < count; ++i) {
dst[i] = sk_float_rsqrt(src[i]);
}
}
private:
typedef MathBench INHERITED;
};
class SlowISqrtMathBench : public MathBench {
public:
SlowISqrtMathBench() : INHERITED("slowIsqrt") {}
@ -550,6 +566,7 @@ DEF_BENCH(return new DivModBench<int64_t>("int64_t"))
///////////////////////////////////////////////////////////////////////////////
DEF_BENCH( return new NoOpMathBench(); )
DEF_BENCH( return new SkRSqrtMathBench(); )
DEF_BENCH( return new SlowISqrtMathBench(); )
DEF_BENCH( return new FastISqrtMathBench(); )
DEF_BENCH( return new QMul64Bench(); )

Просмотреть файл

@ -96,4 +96,45 @@ extern const uint32_t gIEEENegativeInfinity;
#define SK_FloatNaN (*SkTCast<const float*>(&gIEEENotANumber))
#define SK_FloatInfinity (*SkTCast<const float*>(&gIEEEInfinity))
#define SK_FloatNegativeInfinity (*SkTCast<const float*>(&gIEEENegativeInfinity))
#if defined(__SSE__)
#include <xmmintrin.h>
#elif defined(__ARM_NEON__)
#include <arm_neon.h>
#endif
// Fast, approximate inverse square root.
// Compare to name-brand "1.0f / sk_float_sqrt(x)". Should be around 10x faster on SSE, 2x on NEON.
static inline float sk_float_rsqrt(const float x) {
// We want all this inlined, so we'll inline SIMD and just take the hit when we don't know we've got
// it at compile time. This is going to be too fast to productively hide behind a function pointer.
//
// We do one step of Newton's method to refine the estimates in the NEON and null paths. No
// refinement is faster, but very innacurate. Two steps is more accurate, but slower than 1/sqrt.
#if defined(__SSE__)
float result;
_mm_store_ss(&result, _mm_rsqrt_ss(_mm_set_ss(x)));
return result;
#elif defined(__ARM_NEON__)
// Get initial estimate.
const float32x2_t xx = vdup_n_f32(x); // Clever readers will note we're doing everything 2x.
float32x2_t estimate = vrsqrte_f32(xx);
// One step of Newton's method to refine.
const float32x2_t estimate_sq = vmul_f32(estimate, estimate);
estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq));
return vget_lane_f32(estimate, 0); // 1 will work fine too; the answer's in both places.
#else
// Get initial estimate.
int i = *SkTCast<int*>(&x);
i = 0x5f3759df - (i>>1);
float estimate = *SkTCast<float*>(&i);
// One step of Newton's method to refine.
const float estimate_sq = estimate*estimate;
estimate *= (1.5f-0.5f*x*estimate_sq);
return estimate;
#endif
}
#endif

Просмотреть файл

@ -216,13 +216,10 @@ struct SK_API SkPoint {
* Return true if the computed length of the vector is >= the internal
* tolerance (used to avoid dividing by tiny values).
*/
static bool CanNormalize(SkScalar dx, SkScalar dy)
#ifdef SK_SCALAR_IS_FLOAT
// Simple enough (and performance critical sometimes) so we inline it.
{ return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero); }
#else
;
#endif
static bool CanNormalize(SkScalar dx, SkScalar dy) {
// Simple enough (and performance critical sometimes) so we inline it.
return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero);
}
bool canNormalize() const {
return CanNormalize(fX, fY);
@ -252,6 +249,14 @@ struct SK_API SkPoint {
*/
bool setLength(SkScalar x, SkScalar y, SkScalar length);
/** Same as setLength, but favoring speed over accuracy.
*/
bool setLengthFast(SkScalar length);
/** Same as setLength, but favoring speed over accuracy.
*/
bool setLengthFast(SkScalar x, SkScalar y, SkScalar length);
/** Scale the point's coordinates by scale, writing the answer into dst.
It is legal for dst == this.
*/
@ -316,7 +321,6 @@ struct SK_API SkPoint {
* Returns true if both X and Y are finite (not infinity or NaN)
*/
bool isFinite() const {
#ifdef SK_SCALAR_IS_FLOAT
SkScalar accum = 0;
accum *= fX;
accum *= fY;
@ -327,12 +331,6 @@ struct SK_API SkPoint {
// value==value will be true iff value is not NaN
// TODO: is it faster to say !accum or accum==accum?
return accum == accum;
#else
// use bit-or for speed, since we don't care about short-circuting the
// tests, and we expect the common case will be that we need to check all.
int isNaN = (SK_FixedNaN == fX) | (SK_FixedNaN == fX));
return !isNaN;
#endif
}
/**

Просмотреть файл

@ -87,8 +87,6 @@ bool SkPoint::setLength(SkScalar length) {
return this->setLength(fX, fY, length);
}
#ifdef SK_SCALAR_IS_FLOAT
// Returns the square of the Euclidian distance to (dx,dy).
static inline float getLengthSquared(float dx, float dy) {
return dx * dx + dy * dy;
@ -177,290 +175,32 @@ bool SkPoint::setLength(float x, float y, float length) {
return true;
}
#else
#include "Sk64.h"
// Returns the square of the Euclidian distance to (dx,dy) in *result.
static inline void getLengthSquared(SkScalar dx, SkScalar dy, Sk64 *result) {
Sk64 dySqr;
result->setMul(dx, dx);
dySqr.setMul(dy, dy);
result->add(dySqr);
bool SkPoint::setLengthFast(float length) {
return this->setLengthFast(fX, fY, length);
}
// Calculates the square of the Euclidian distance to (dx,dy) and stores it in
// *lengthSquared. Returns true if the distance is judged to be "nearly zero".
//
// This logic is encapsulated in a helper method to make it explicit that we
// always perform this check in the same manner, to avoid inconsistencies
// (see http://code.google.com/p/skia/issues/detail?id=560 ).
static inline bool isLengthNearlyZero(SkScalar dx, SkScalar dy,
Sk64 *lengthSquared) {
Sk64 tolSqr;
getLengthSquared(dx, dy, lengthSquared);
// we want nearlyzero^2, but to compute it fast we want to just do a
// 32bit multiply, so we require that it not exceed 31bits. That is true
// if nearlyzero is <= 0xB504, which should be trivial, since usually
// nearlyzero is a very small fixed-point value.
SkASSERT(SK_ScalarNearlyZero <= 0xB504);
tolSqr.set(0, SK_ScalarNearlyZero * SK_ScalarNearlyZero);
return *lengthSquared <= tolSqr;
}
SkScalar SkPoint::Normalize(SkPoint* pt) {
Sk64 mag2;
if (!isLengthNearlyZero(pt->fX, pt->fY, &mag2)) {
SkScalar mag = mag2.getSqrt();
SkScalar scale = SkScalarInvert(mag);
pt->fX = SkScalarMul(pt->fX, scale);
pt->fY = SkScalarMul(pt->fY, scale);
return mag;
}
return 0;
}
bool SkPoint::CanNormalize(SkScalar dx, SkScalar dy) {
Sk64 mag2_unused;
return !isLengthNearlyZero(dx, dy, &mag2_unused);
}
SkScalar SkPoint::Length(SkScalar dx, SkScalar dy) {
Sk64 tmp;
getLengthSquared(dx, dy, &tmp);
return tmp.getSqrt();
}
#ifdef SK_DEBUGx
static SkFixed fixlen(SkFixed x, SkFixed y) {
float fx = (float)x;
float fy = (float)y;
return (int)floorf(sqrtf(fx*fx + fy*fy) + 0.5f);
}
#endif
static inline uint32_t squarefixed(unsigned x) {
x >>= 16;
return x*x;
}
#if 1 // Newton iter for setLength
static inline unsigned invsqrt_iter(unsigned V, unsigned U) {
unsigned x = V * U >> 14;
x = x * U >> 14;
x = (3 << 14) - x;
x = (U >> 1) * x >> 14;
return x;
}
static const uint16_t gInvSqrt14GuessTable[] = {
0x4000, 0x3c57, 0x393e, 0x3695, 0x3441, 0x3235, 0x3061,
0x2ebd, 0x2d41, 0x2be7, 0x2aaa, 0x2987, 0x287a, 0x2780,
0x2698, 0x25be, 0x24f3, 0x2434, 0x2380, 0x22d6, 0x2235,
0x219d, 0x210c, 0x2083, 0x2000, 0x1f82, 0x1f0b, 0x1e99,
0x1e2b, 0x1dc2, 0x1d5d, 0x1cfc, 0x1c9f, 0x1c45, 0x1bee,
0x1b9b, 0x1b4a, 0x1afc, 0x1ab0, 0x1a67, 0x1a20, 0x19dc,
0x1999, 0x1959, 0x191a, 0x18dd, 0x18a2, 0x1868, 0x1830,
0x17fa, 0x17c4, 0x1791, 0x175e, 0x172d, 0x16fd, 0x16ce
};
#define BUILD_INVSQRT_TABLEx
#ifdef BUILD_INVSQRT_TABLE
static void build_invsqrt14_guess_table() {
for (int i = 8; i <= 63; i++) {
unsigned x = SkToU16((1 << 28) / SkSqrt32(i << 25));
printf("0x%x, ", x);
}
printf("\n");
}
#endif
static unsigned fast_invsqrt(uint32_t x) {
#ifdef BUILD_INVSQRT_TABLE
unsigned top2 = x >> 25;
SkASSERT(top2 >= 8 && top2 <= 63);
static bool gOnce;
if (!gOnce) {
build_invsqrt14_guess_table();
gOnce = true;
}
#endif
unsigned V = x >> 14; // make V .14
unsigned top = x >> 25;
SkASSERT(top >= 8 && top <= 63);
SkASSERT(top - 8 < SK_ARRAY_COUNT(gInvSqrt14GuessTable));
unsigned U = gInvSqrt14GuessTable[top - 8];
U = invsqrt_iter(V, U);
return invsqrt_iter(V, U);
}
/* We "normalize" x,y to be .14 values (so we can square them and stay 32bits.
Then we Newton-iterate this in .14 space to compute the invser-sqrt, and
scale by it at the end. The .14 space means we can execute our iterations
and stay in 32bits as well, making the multiplies much cheaper than calling
SkFixedMul.
*/
bool SkPoint::setLength(SkFixed ox, SkFixed oy, SkFixed length) {
if (ox == 0) {
if (oy == 0) {
return false;
}
this->set(0, SkApplySign(length, SkExtractSign(oy)));
return true;
}
if (oy == 0) {
this->set(SkApplySign(length, SkExtractSign(ox)), 0);
return true;
bool SkPoint::setLengthFast(float x, float y, float length) {
float mag2;
if (isLengthNearlyZero(x, y, &mag2)) {
return false;
}
unsigned x = SkAbs32(ox);
unsigned y = SkAbs32(oy);
int zeros = SkCLZ(x | y);
// make x,y 1.14 values so our fast sqr won't overflow
if (zeros > 17) {
x <<= zeros - 17;
y <<= zeros - 17;
float scale;
if (SkScalarIsFinite(mag2)) {
scale = length * sk_float_rsqrt(mag2); // <--- this is the difference
} else {
x >>= 17 - zeros;
y >>= 17 - zeros;
// our mag2 step overflowed to infinity, so use doubles instead.
// much slower, but needed when x or y are very large, other wise we
// divide by inf. and return (0,0) vector.
double xx = x;
double yy = y;
scale = (float)(length / sqrt(xx * xx + yy * yy));
}
SkASSERT((x | y) <= 0x7FFF);
unsigned invrt = fast_invsqrt(x*x + y*y);
x = x * invrt >> 12;
y = y * invrt >> 12;
if (length != SK_Fixed1) {
x = SkFixedMul(x, length);
y = SkFixedMul(y, length);
}
this->set(SkApplySign(x, SkExtractSign(ox)),
SkApplySign(y, SkExtractSign(oy)));
fX = x * scale;
fY = y * scale;
return true;
}
#else
/*
Normalize x,y, and then scale them by length.
The obvious way to do this would be the following:
S64 tmp1, tmp2;
tmp1.setMul(x,x);
tmp2.setMul(y,y);
tmp1.add(tmp2);
len = tmp1.getSqrt();
x' = SkFixedDiv(x, len);
y' = SkFixedDiv(y, len);
This is fine, but slower than what we do below.
The present technique does not compute the starting length, but
rather fiddles with x,y iteratively, all the while checking its
magnitude^2 (avoiding a sqrt).
We normalize by first shifting x,y so that at least one of them
has bit 31 set (after taking the abs of them).
Then we loop, refining x,y by squaring them and comparing
against a very large 1.0 (1 << 28), and then adding or subtracting
a delta (which itself is reduced by half each time through the loop).
For speed we want the squaring to be with a simple integer mul. To keep
that from overflowing we shift our coordinates down until we are dealing
with at most 15 bits (2^15-1)^2 * 2 says withing 32 bits)
When our square is close to 1.0, we shift x,y down into fixed range.
*/
bool SkPoint::setLength(SkFixed ox, SkFixed oy, SkFixed length) {
if (ox == 0) {
if (oy == 0)
return false;
this->set(0, SkApplySign(length, SkExtractSign(oy)));
return true;
}
if (oy == 0) {
this->set(SkApplySign(length, SkExtractSign(ox)), 0);
return true;
}
SkFixed x = SkAbs32(ox);
SkFixed y = SkAbs32(oy);
// shift x,y so that the greater of them is 15bits (1.14 fixed point)
{
int shift = SkCLZ(x | y);
// make them .30
x <<= shift - 1;
y <<= shift - 1;
}
SkFixed dx = x;
SkFixed dy = y;
for (int i = 0; i < 17; i++) {
dx >>= 1;
dy >>= 1;
U32 len2 = squarefixed(x) + squarefixed(y);
if (len2 >> 28) {
x -= dx;
y -= dy;
} else {
x += dx;
y += dy;
}
}
x >>= 14;
y >>= 14;
#ifdef SK_DEBUGx // measure how far we are from unit-length
{
static int gMaxError;
static int gMaxDiff;
SkFixed len = fixlen(x, y);
int err = len - SK_Fixed1;
err = SkAbs32(err);
if (err > gMaxError) {
gMaxError = err;
SkDebugf("gMaxError %d\n", err);
}
float fx = SkAbs32(ox)/65536.0f;
float fy = SkAbs32(oy)/65536.0f;
float mag = sqrtf(fx*fx + fy*fy);
fx /= mag;
fy /= mag;
SkFixed xx = (int)floorf(fx * 65536 + 0.5f);
SkFixed yy = (int)floorf(fy * 65536 + 0.5f);
err = SkMax32(SkAbs32(xx-x), SkAbs32(yy-y));
if (err > gMaxDiff) {
gMaxDiff = err;
SkDebugf("gMaxDiff %d\n", err);
}
}
#endif
x = SkApplySign(x, SkExtractSign(ox));
y = SkApplySign(y, SkExtractSign(oy));
if (length != SK_Fixed1) {
x = SkFixedMul(x, length);
y = SkFixedMul(y, length);
}
this->set(x, y);
return true;
}
#endif
#endif
///////////////////////////////////////////////////////////////////////////////

Просмотреть файл

@ -117,7 +117,8 @@ static void test_underflow(skiatest::Reporter* reporter) {
REPORTER_ASSERT(reporter, pt == copy); // pt is unchanged
}
static void PointTest(skiatest::Reporter* reporter) {
#include "TestClassDef.h"
DEF_TEST(Point, reporter) {
test_casts(reporter);
static const struct {
@ -137,5 +138,22 @@ static void PointTest(skiatest::Reporter* reporter) {
test_overflow(reporter);
}
#include "TestClassDef.h"
DEFINE_TESTCLASS("Point", PointTestClass, PointTest)
DEF_TEST(Point_setLengthFast, reporter) {
// Scale a (1,1) point to a bunch of different lengths,
// making sure the slow and fast paths are within 0.1%.
const float tests[] = { 1.0f, 0.0f, 1.0e-37f, 3.4e38f, 42.0f, 0.00012f };
const SkPoint kOne = {1.0f, 1.0f};
for (unsigned i = 0; i < SK_ARRAY_COUNT(tests); i++) {
SkPoint slow = kOne, fast = kOne;
slow.setLength(tests[i]);
fast.setLengthFast(tests[i]);
if (slow.length() < FLT_MIN && fast.length() < FLT_MIN) continue;
SkScalar ratio = slow.length() / fast.length();
REPORTER_ASSERT(reporter, ratio > 0.999f);
REPORTER_ASSERT(reporter, ratio < 1.001f);
}
}