зеркало из https://github.com/mozilla/moz-skia.git
Rewrite memset benches, then use results to add a small-N optimization.
The benches for N <= 10 get around 2x faster on my N7 and N9. I believe this is because of the reduced function-call-then-function-pointer-call overhead on the N7, and additionally because it seems autovectorization beats our NEON code for small N on the N9. My desktop is unchanged, though that's probably because N=10 lies well within a region where memset's performance is essentially constant: N=100 takes only about 2x as long as N=1 and N=10, which perform nearly identically. BUG=skia: Review URL: https://codereview.chromium.org/1073863002
This commit is contained in:
Родитель
a1e41c6d9a
Коммит
9ff378b01b
|
@ -6,110 +6,79 @@
|
|||
*/
|
||||
|
||||
#include "Benchmark.h"
|
||||
#include "SkCanvas.h"
|
||||
#include "SkString.h"
|
||||
#include "SkTemplates.h"
|
||||
#include "SkUtils.h"
|
||||
|
||||
template <typename T, bool kInline>
|
||||
class MemsetBench : public Benchmark {
|
||||
SkString fName;
|
||||
|
||||
protected:
|
||||
int fMinSize;
|
||||
int fMaxSize;
|
||||
enum {
|
||||
kBufferSize = 10000,
|
||||
VALUE32 = 0x12345678,
|
||||
VALUE16 = 0x1234
|
||||
};
|
||||
|
||||
enum MemsetType {
|
||||
MEMSET16 = 16,
|
||||
MEMSET32 = 32
|
||||
};
|
||||
|
||||
public:
|
||||
MemsetBench(MemsetType type, int minSize, int maxSize) {
|
||||
SkASSERT((minSize < maxSize) && (maxSize <= kBufferSize));
|
||||
fMinSize = minSize;
|
||||
fMaxSize = maxSize;
|
||||
fName.printf("memset%d_%d_%d", type, minSize, maxSize);
|
||||
}
|
||||
explicit MemsetBench(int n)
|
||||
: fN(n)
|
||||
, fBuffer(n)
|
||||
, fName(SkStringPrintf("memset%d_%d%s", sizeof(T)*8, n, kInline ? "_inline" : "")) {}
|
||||
|
||||
bool isSuitableFor(Backend backend) override {
|
||||
return backend == kNonRendering_Backend;
|
||||
}
|
||||
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
|
||||
const char* onGetName() override { return fName.c_str(); }
|
||||
|
||||
virtual void performTest() = 0;
|
||||
|
||||
protected:
|
||||
const char* onGetName() override {
|
||||
return fName.c_str();
|
||||
}
|
||||
|
||||
void onDraw(const int loops, SkCanvas* canvas) override {
|
||||
for (int i = 0; i < loops; ++i) {
|
||||
this->performTest();
|
||||
}
|
||||
}
|
||||
void onDraw(const int loops, SkCanvas*) override;
|
||||
|
||||
private:
|
||||
typedef Benchmark INHERITED;
|
||||
int fN;
|
||||
SkAutoTMalloc<T> fBuffer;
|
||||
SkString fName;
|
||||
};
|
||||
|
||||
class Memset32Bench : public MemsetBench {
|
||||
uint32_t kBuffer[kBufferSize + 3];
|
||||
public:
|
||||
Memset32Bench(int minSize, int maxSize)
|
||||
: INHERITED(MEMSET32, minSize, maxSize) {}
|
||||
|
||||
protected:
|
||||
void performTest() override {
|
||||
for(int j = fMinSize; j < fMaxSize; ++j){
|
||||
sk_memset32(kBuffer, VALUE32, j);
|
||||
sk_memset32(kBuffer + 1, VALUE32, j);
|
||||
sk_memset32(kBuffer + 2, VALUE32, j);
|
||||
sk_memset32(kBuffer + 3, VALUE32, j);
|
||||
}
|
||||
template <> void MemsetBench<uint32_t, false>::onDraw(const int loops, SkCanvas*) {
|
||||
for (int i = 0; i < 1000*loops; i++) {
|
||||
sk_memset32(fBuffer.get(), 0xFACEB004, fN);
|
||||
}
|
||||
private:
|
||||
typedef MemsetBench INHERITED;
|
||||
};
|
||||
}
|
||||
|
||||
class Memset16Bench : public MemsetBench {
|
||||
uint16_t kBuffer[kBufferSize + 7];
|
||||
public:
|
||||
Memset16Bench(int minSize, int maxSize)
|
||||
: INHERITED(MEMSET16, minSize, maxSize) {}
|
||||
|
||||
protected:
|
||||
void performTest() override {
|
||||
for(int j = fMinSize; j < fMaxSize; ++j){
|
||||
sk_memset16(kBuffer, VALUE16, j);
|
||||
sk_memset16(kBuffer + 1, VALUE16, j);
|
||||
sk_memset16(kBuffer + 2, VALUE16, j);
|
||||
sk_memset16(kBuffer + 3, VALUE16, j);
|
||||
sk_memset16(kBuffer + 4, VALUE16, j);
|
||||
sk_memset16(kBuffer + 5, VALUE16, j);
|
||||
sk_memset16(kBuffer + 6, VALUE16, j);
|
||||
sk_memset16(kBuffer + 7, VALUE16, j);
|
||||
}
|
||||
template <> void MemsetBench<uint16_t, false>::onDraw(const int loops, SkCanvas*) {
|
||||
for (int i = 0; i < 1000*loops; i++) {
|
||||
sk_memset16(fBuffer.get(), 0x4973, fN);
|
||||
}
|
||||
private:
|
||||
typedef MemsetBench INHERITED;
|
||||
};
|
||||
}
|
||||
|
||||
DEF_BENCH(return new Memset32Bench(1, 600);)
|
||||
DEF_BENCH(return new Memset32Bench(600, 800);)
|
||||
DEF_BENCH(return new Memset32Bench(800, 1000);)
|
||||
DEF_BENCH(return new Memset32Bench(1000, 2000);)
|
||||
DEF_BENCH(return new Memset32Bench(2000, 3000);)
|
||||
DEF_BENCH(return new Memset32Bench(3000, 4000);)
|
||||
DEF_BENCH(return new Memset32Bench(4000, 5000);)
|
||||
// Plain element-by-element fill: writes val into dst[0] .. dst[n-1].
// Does nothing when n is zero or negative.
template <typename T>
static void memsetT(T* dst, T val, int n) {
    T* out = dst;
    for (int left = n; left > 0; --left) {
        *out++ = val;
    }
}
|
||||
|
||||
DEF_BENCH(return new Memset16Bench(1, 600);)
|
||||
DEF_BENCH(return new Memset16Bench(600, 800);)
|
||||
DEF_BENCH(return new Memset16Bench(800, 1000);)
|
||||
DEF_BENCH(return new Memset16Bench(1000, 2000);)
|
||||
DEF_BENCH(return new Memset16Bench(2000, 3000);)
|
||||
DEF_BENCH(return new Memset16Bench(3000, 4000);)
|
||||
DEF_BENCH(return new Memset16Bench(4000, 5000);)
|
||||
// Inline (kInline == true) 32-bit variant: fills the first fN elements of
// fBuffer with 0xFACEB004 via memsetT, repeated 1000*loops times.
template <> void MemsetBench<uint32_t, true>::onDraw(const int loops, SkCanvas*) {
    const int reps = 1000*loops;
    for (int rep = 0; rep < reps; ++rep) {
        memsetT<uint32_t>(fBuffer.get(), 0xFACEB004, fN);
    }
}
|
||||
|
||||
// Inline (kInline == true) 16-bit variant: fills the first fN elements of
// fBuffer with 0x4973 via memsetT, repeated 1000*loops times.
template <> void MemsetBench<uint16_t, true>::onDraw(const int loops, SkCanvas*) {
    const int reps = 1000*loops;
    for (int rep = 0; rep < reps; ++rep) {
        memsetT<uint16_t>(fBuffer.get(), 0x4973, fN);
    }
}
|
||||
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(1)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(1)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(10)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(10)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(100)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(100)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(1000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(1000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(10000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(10000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, true>(100000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint32_t, false>(100000)));
|
||||
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(1)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(1)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(10)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(10)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(100)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(100)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(1000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(1000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(10000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(10000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, true>(100000)));
|
||||
DEF_BENCH(return (new MemsetBench<uint16_t, false>(100000)));
|
||||
|
|
|
@ -12,12 +12,31 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Determined empirically using bench/MemsetBench.cpp on a Nexus 7, Nexus 9, and desktop.
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 || defined(SK_ARM_HAS_NEON)
|
||||
// Platforms where we can assume an autovectorizer will give us a good inline memset.
|
||||
#define SK_SMALL_MEMSET 1000
|
||||
#else
|
||||
// Platforms like Chrome on ARMv7 that don't typically compile with NEON globally.
|
||||
#define SK_SMALL_MEMSET 10
|
||||
#endif
|
||||
|
||||
|
||||
/** Similar to memset(), but it assigns a 16bit value into the buffer.
|
||||
@param buffer The memory to have value copied into it
|
||||
@param value The 16bit value to be copied into buffer
|
||||
@param count The number of times value should be copied into the buffer.
|
||||
*/
|
||||
void sk_memset16(uint16_t dst[], uint16_t value, int count);
|
||||
void sk_memset16_large(uint16_t dst[], uint16_t value, int count);
|
||||
inline void sk_memset16(uint16_t dst[], uint16_t value, int count) {
|
||||
if (count <= SK_SMALL_MEMSET) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
dst[i] = value;
|
||||
}
|
||||
} else {
|
||||
sk_memset16_large(dst, value, count);
|
||||
}
|
||||
}
|
||||
typedef void (*SkMemset16Proc)(uint16_t dst[], uint16_t value, int count);
|
||||
SkMemset16Proc SkMemset16GetPlatformProc();
|
||||
|
||||
|
@ -26,10 +45,22 @@ SkMemset16Proc SkMemset16GetPlatformProc();
|
|||
@param value The 32bit value to be copied into buffer
|
||||
@param count The number of times value should be copied into the buffer.
|
||||
*/
|
||||
void sk_memset32(uint32_t dst[], uint32_t value, int count);
|
||||
void sk_memset32_large(uint32_t dst[], uint32_t value, int count);
|
||||
inline void sk_memset32(uint32_t dst[], uint32_t value, int count) {
|
||||
if (count <= SK_SMALL_MEMSET) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
dst[i] = value;
|
||||
}
|
||||
} else {
|
||||
sk_memset32_large(dst, value, count);
|
||||
}
|
||||
}
|
||||
|
||||
typedef void (*SkMemset32Proc)(uint32_t dst[], uint32_t value, int count);
|
||||
SkMemset32Proc SkMemset32GetPlatformProc();
|
||||
|
||||
#undef SK_SMALL_MEMSET
|
||||
|
||||
/** Similar to memcpy(), but it copies count 32bit values from src to dst.
|
||||
@param dst The memory to have value copied into it
|
||||
@param src The memory to have value copied from it
|
||||
|
|
|
@ -134,12 +134,12 @@ SkMemcpy32Proc choose_memcpy32() {
|
|||
|
||||
} // namespace
|
||||
|
||||
void sk_memset16(uint16_t dst[], uint16_t value, int count) {
|
||||
void sk_memset16_large(uint16_t dst[], uint16_t value, int count) {
|
||||
SK_DECLARE_STATIC_LAZY_FN_PTR(SkMemset16Proc, proc, choose_memset16);
|
||||
proc.get()(dst, value, count);
|
||||
}
|
||||
|
||||
void sk_memset32(uint32_t dst[], uint32_t value, int count) {
|
||||
void sk_memset32_large(uint32_t dst[], uint32_t value, int count) {
|
||||
SK_DECLARE_STATIC_LAZY_FN_PTR(SkMemset32Proc, proc, choose_memset32);
|
||||
proc.get()(dst, value, count);
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче