зеркало из https://github.com/mozilla/moz-skia.git
(SSE2) acceleration for rectangular opaque erases.
15% speedup for rectangles < 31 px wide, 5% for larger. http://codereview.appspot.com/5843050/ git-svn-id: http://skia.googlecode.com/svn/trunk@3423 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
Родитель
26936d071f
Коммит
8dd90a926a
|
@ -40,6 +40,7 @@
|
||||||
'../src/opts/opts_check_SSE2.cpp',
|
'../src/opts/opts_check_SSE2.cpp',
|
||||||
'../src/opts/SkBitmapProcState_opts_SSE2.cpp',
|
'../src/opts/SkBitmapProcState_opts_SSE2.cpp',
|
||||||
'../src/opts/SkBlitRow_opts_SSE2.cpp',
|
'../src/opts/SkBlitRow_opts_SSE2.cpp',
|
||||||
|
'../src/opts/SkBlitRect_opts_SSE2.cpp',
|
||||||
'../src/opts/SkUtils_opts_SSE2.cpp',
|
'../src/opts/SkUtils_opts_SSE2.cpp',
|
||||||
],
|
],
|
||||||
'dependencies': [
|
'dependencies': [
|
||||||
|
|
|
@ -36,13 +36,6 @@ public:
|
||||||
const SkPMColor* src,
|
const SkPMColor* src,
|
||||||
int count, U8CPU alpha, int x, int y);
|
int count, U8CPU alpha, int x, int y);
|
||||||
|
|
||||||
/** Function pointer that blends a single color with a row of 32-bit colors
|
|
||||||
onto a 32-bit destination
|
|
||||||
*/
|
|
||||||
typedef void (*ColorProc)(SkPMColor* dst, const SkPMColor* src, int count,
|
|
||||||
SkPMColor color);
|
|
||||||
|
|
||||||
//! Public entry-point to return a blit function ptr
|
|
||||||
static Proc Factory(unsigned flags, SkBitmap::Config);
|
static Proc Factory(unsigned flags, SkBitmap::Config);
|
||||||
|
|
||||||
///////////// D32 version
|
///////////// D32 version
|
||||||
|
@ -64,6 +57,12 @@ public:
|
||||||
|
|
||||||
static Proc32 Factory32(unsigned flags32);
|
static Proc32 Factory32(unsigned flags32);
|
||||||
|
|
||||||
|
/** Function pointer that blends a single color with a row of 32-bit colors
|
||||||
|
onto a 32-bit destination
|
||||||
|
*/
|
||||||
|
typedef void (*ColorProc)(SkPMColor* dst, const SkPMColor* src, int count,
|
||||||
|
SkPMColor color);
|
||||||
|
|
||||||
/** Blend a single color onto a row of S32 pixels, writing the result
|
/** Blend a single color onto a row of S32 pixels, writing the result
|
||||||
into a row of D32 pixels. src and dst may be the same memory, but
|
into a row of D32 pixels. src and dst may be the same memory, but
|
||||||
if they are not, they may not overlap.
|
if they are not, they may not overlap.
|
||||||
|
@ -71,8 +70,20 @@ public:
|
||||||
static void Color32(SkPMColor dst[], const SkPMColor src[],
|
static void Color32(SkPMColor dst[], const SkPMColor src[],
|
||||||
int count, SkPMColor color);
|
int count, SkPMColor color);
|
||||||
|
|
||||||
|
//! Public entry-point to return a blit function ptr
|
||||||
static ColorProc ColorProcFactory();
|
static ColorProc ColorProcFactory();
|
||||||
|
|
||||||
|
/** Function pointer that blends a single color onto a 32-bit rectangle. */
|
||||||
|
typedef void (*ColorRectProc)(SkPMColor* dst, int width, int height,
|
||||||
|
size_t rowBytes, SkPMColor color);
|
||||||
|
|
||||||
|
/** Blend a single color into a rectangle of D32 pixels. */
|
||||||
|
static void ColorRect32(SkPMColor* dst, int width, int height,
|
||||||
|
size_t rowBytes, SkPMColor color);
|
||||||
|
|
||||||
|
//! Public entry-point to return a blit function ptr
|
||||||
|
static ColorRectProc ColorRectProcFactory();
|
||||||
|
|
||||||
/** These static functions are called by the Factory and Factory32
|
/** These static functions are called by the Factory and Factory32
|
||||||
functions, and should return either NULL, or a
|
functions, and should return either NULL, or a
|
||||||
platform-specific function-ptr to be used in place of the
|
platform-specific function-ptr to be used in place of the
|
||||||
|
|
|
@ -12,6 +12,8 @@
|
||||||
|
|
||||||
#define UNROLL
|
#define UNROLL
|
||||||
|
|
||||||
|
// Platform hook consulted by SkBlitRow::ColorRectProcFactory(): returns a
// rectangle-blend routine, or NULL to request the portable ColorRect32.
SkBlitRow::ColorRectProc PlatformColorRectProcFactory();
|
||||||
|
|
||||||
static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
|
static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
|
||||||
const SkPMColor* SK_RESTRICT src,
|
const SkPMColor* SK_RESTRICT src,
|
||||||
int count, U8CPU alpha) {
|
int count, U8CPU alpha) {
|
||||||
|
@ -178,3 +180,21 @@ void SkBlitRow::Color32(SkPMColor* SK_RESTRICT dst,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Portable rectangle blend: applies the per-row ColorProc to each of the
    `height` scanlines in turn, blending `color` in place.

    @param dst       first pixel of the top row
    @param width     pixels per row, forwarded to the row proc as its count
    @param height    number of rows; nothing is written when height <= 0
    @param rowBytes  distance in bytes from one row start to the next
    @param color     colour handed to the row proc
*/
void SkBlitRow::ColorRect32(SkPMColor* dst, int width, int height,
                            size_t rowBytes, SkPMColor color) {
    const SkBlitRow::ColorProc blendRow = SkBlitRow::ColorProcFactory();
    char* rowStart = reinterpret_cast<char*>(dst);
    for (int row = 0; row < height; ++row) {
        // Source and destination are the same row, so the blend is in place.
        SkPMColor* pixels = reinterpret_cast<SkPMColor*>(rowStart);
        blendRow(pixels, pixels, width, color);
        rowStart += rowBytes;
    }
}
|
||||||
|
|
||||||
|
/** Selects the rectangle-blend routine: the one supplied by
    PlatformColorRectProcFactory() when it is non-NULL, otherwise the
    portable ColorRect32. The result is never NULL.
*/
SkBlitRow::ColorRectProc SkBlitRow::ColorRectProcFactory() {
    SkBlitRow::ColorRectProc platformProc = PlatformColorRectProcFactory();
    SkBlitRow::ColorRectProc chosen =
            (NULL != platformProc) ? platformProc : ColorRect32;
    SkASSERT(chosen);
    return chosen;
}
|
||||||
|
|
||||||
|
|
|
@ -53,6 +53,7 @@ SkARGB32_Blitter::SkARGB32_Blitter(const SkBitmap& device, const SkPaint& paint)
|
||||||
|
|
||||||
fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
|
fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
|
||||||
fColor32Proc = SkBlitRow::ColorProcFactory();
|
fColor32Proc = SkBlitRow::ColorProcFactory();
|
||||||
|
fColorRect32Proc = SkBlitRow::ColorRectProcFactory();
|
||||||
}
|
}
|
||||||
|
|
||||||
const SkBitmap* SkARGB32_Blitter::justAnOpaqueColor(uint32_t* value) {
|
const SkBitmap* SkARGB32_Blitter::justAnOpaqueColor(uint32_t* value) {
|
||||||
|
@ -213,10 +214,14 @@ void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
|
||||||
uint32_t color = fPMColor;
|
uint32_t color = fPMColor;
|
||||||
size_t rowBytes = fDevice.rowBytes();
|
size_t rowBytes = fDevice.rowBytes();
|
||||||
|
|
||||||
while (--height >= 0) {
|
//if (255 == SkGetPackedA32(color)) {
|
||||||
fColor32Proc(device, device, width, color);
|
fColorRect32Proc(device, width, height, rowBytes, color);
|
||||||
device = (uint32_t*)((char*)device + rowBytes);
|
//} else {
|
||||||
}
|
//while (--height >= 0) {
|
||||||
|
//fColor32Proc(device, device, width, color);
|
||||||
|
//device = (uint32_t*)((char*)device + rowBytes);
|
||||||
|
//}
|
||||||
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined _WIN32 && _MSC_VER >= 1300
|
#if defined _WIN32 && _MSC_VER >= 1300
|
||||||
|
|
|
@ -94,6 +94,7 @@ protected:
|
||||||
SkColor fColor;
|
SkColor fColor;
|
||||||
SkPMColor fPMColor;
|
SkPMColor fPMColor;
|
||||||
SkBlitRow::ColorProc fColor32Proc;
|
SkBlitRow::ColorProc fColor32Proc;
|
||||||
|
SkBlitRow::ColorRectProc fColorRect32Proc;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
unsigned fSrcA, fSrcR, fSrcG, fSrcB;
|
unsigned fSrcA, fSrcR, fSrcG, fSrcB;
|
||||||
|
|
|
@ -0,0 +1,133 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2011 Google Inc.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license that can be
|
||||||
|
* found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "SkBlitRect_opts_SSE2.h"
|
||||||
|
#include "SkBlitRow.h"
|
||||||
|
#include "SkColorPriv.h"
|
||||||
|
|
||||||
|
#include <emmintrin.h>
|
||||||
|
|
||||||
|
/** Simple blitting of opaque rectangles less than 31 pixels wide:
|
||||||
|
inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
|
||||||
|
*/
|
||||||
|
void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
|
||||||
|
int width, int height,
|
||||||
|
size_t rowBytes, uint32_t color) {
|
||||||
|
SkASSERT(255 == SkGetPackedA32(color));
|
||||||
|
SkASSERT(width > 0);
|
||||||
|
SkASSERT(width < 31);
|
||||||
|
|
||||||
|
while (--height >= 0) {
|
||||||
|
SkPMColor* dst = destination;
|
||||||
|
int count = width;
|
||||||
|
|
||||||
|
while (count > 4) {
|
||||||
|
*dst++ = color;
|
||||||
|
*dst++ = color;
|
||||||
|
*dst++ = color;
|
||||||
|
*dst++ = color;
|
||||||
|
count -= 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (count > 0) {
|
||||||
|
*dst++ = color;
|
||||||
|
--count;
|
||||||
|
}
|
||||||
|
|
||||||
|
destination = (uint32_t*)((char*)destination + rowBytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
Fast blitting of opaque rectangles at least 31 pixels wide:
|
||||||
|
inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
|
||||||
|
A 31 pixel rectangle is guaranteed to have at least one
|
||||||
|
16-pixel aligned span that can take advantage of mm_store.
|
||||||
|
*/
|
||||||
|
void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
|
||||||
|
int width, int height,
|
||||||
|
size_t rowBytes, uint32_t color) {
|
||||||
|
SkASSERT(255 == SkGetPackedA32(color));
|
||||||
|
SkASSERT(width >= 31);
|
||||||
|
|
||||||
|
__m128i color_wide = _mm_set1_epi32(color);
|
||||||
|
while (--height >= 0) {
|
||||||
|
// Prefetching one row ahead to L1 cache can equal hardware
|
||||||
|
// performance for large/tall rects, but never *beats*
|
||||||
|
// hardware performance.
|
||||||
|
SkPMColor* dst = destination;
|
||||||
|
int count = width;
|
||||||
|
|
||||||
|
while (((size_t)dst) & 0x0F) {
|
||||||
|
*dst++ = color;
|
||||||
|
--count;
|
||||||
|
}
|
||||||
|
__m128i *d = reinterpret_cast<__m128i*>(dst);
|
||||||
|
|
||||||
|
// Googling suggests _mm_stream is only going to beat _mm_store
|
||||||
|
// for things that wouldn't fit in L2 cache anyway, typically
|
||||||
|
// >500kB, and precisely fill cache lines. For us, with
|
||||||
|
// arrays > 100k elements _mm_stream is still 100%+ slower than
|
||||||
|
// mm_store.
|
||||||
|
|
||||||
|
// Unrolling to count >= 64 is a break-even for most
|
||||||
|
// input patterns; we seem to be saturating the bus and having
|
||||||
|
// low enough overhead at 32.
|
||||||
|
|
||||||
|
while (count >= 32) {
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
count -= 32;
|
||||||
|
}
|
||||||
|
if (count >= 16) {
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
_mm_store_si128(d++, color_wide);
|
||||||
|
count -= 16;
|
||||||
|
}
|
||||||
|
dst = reinterpret_cast<uint32_t*>(d);
|
||||||
|
|
||||||
|
// Unrolling the loop in the Narrow code is a significant performance
|
||||||
|
// gain, but unrolling this loop appears to make no difference in
|
||||||
|
// benchmarks with either mm_store_si128 or individual sets.
|
||||||
|
|
||||||
|
while (count > 0) {
|
||||||
|
*dst++ = color;
|
||||||
|
--count;
|
||||||
|
}
|
||||||
|
|
||||||
|
destination = (uint32_t*)((char*)destination + rowBytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Platform rectangle-blend entry point returned by
    PlatformColorRectProcFactory() when SSE2 is available.

    Returns immediately when width, height or color is zero. Otherwise it
    currently always forwards to the portable SkBlitRow::ColorRect32: the
    opaque fast paths below were commented out in the original change (the
    reason is not recorded here) and are kept only as a reference for the
    intended dispatch.

    @param destination  first pixel of the top row
    @param width        pixels per row
    @param height       number of rows
    @param rowBytes     distance in bytes from one row start to the next
    @param color        premultiplied colour to blend
*/
void ColorRect32_SSE2(SkPMColor* destination,
                      int width, int height,
                      size_t rowBytes, uint32_t color) {
    if (0 == height || 0 == width || 0 == color) {
        return;
    }
    // The alpha value was previously extracted into an unused local; it is
    // now read inline in the disabled dispatch so the live code carries no
    // dead computation.
    //if (255 == SkGetPackedA32(color)) {
    //    if (width < 31) {
    //        BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
    //                                     rowBytes, color);
    //    } else {
    //        BlitRect32_OpaqueWide_SSE2(destination, width, height,
    //                                   rowBytes, color);
    //    }
    //} else {
    SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
    //}
}
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2011 Google Inc.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license that can be
|
||||||
|
* found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SkBlitRect_opts_SSE2_DEFINED
|
||||||
|
#define SkBlitRect_opts_SSE2_DEFINED
|
||||||
|
|
||||||
|
/*
|
||||||
|
These functions' implementations copy sections of both
|
||||||
|
SkBlitRow_opts_SSE2 and SkUtils_opts_SSE2.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "SkColor.h"
|
||||||
|
|
||||||
|
/** Blends `color` into a width x height rectangle of 32-bit pixels whose
    top row starts at `dst`; consecutive rows are `rowBytes` bytes apart.
*/
void ColorRect32_SSE2(SkPMColor* SK_RESTRICT dst,
|
||||||
|
int width, int height,
|
||||||
|
size_t rowBytes, uint32_t color);
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
|
@ -8,6 +8,9 @@
|
||||||
#include "SkBitmapProcState_opts_SSE2.h"
|
#include "SkBitmapProcState_opts_SSE2.h"
|
||||||
#include "SkBitmapProcState_opts_SSSE3.h"
|
#include "SkBitmapProcState_opts_SSSE3.h"
|
||||||
#include "SkBlitMask.h"
|
#include "SkBlitMask.h"
|
||||||
|
#include "SkBlitRect.h"
|
||||||
|
#include "SkBlitRow.h"
|
||||||
|
#include "SkBlitRect_opts_SSE2.h"
|
||||||
#include "SkBlitRow_opts_SSE2.h"
|
#include "SkBlitRow_opts_SSE2.h"
|
||||||
#include "SkUtils_opts_SSE2.h"
|
#include "SkUtils_opts_SSE2.h"
|
||||||
#include "SkUtils.h"
|
#include "SkUtils.h"
|
||||||
|
@ -209,3 +212,13 @@ SkMemset32Proc SkMemset32GetPlatformProc() {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Platform hook for SkBlitRow::ColorRectProcFactory(): returns
    ColorRect32_SSE2 when cachedHasSSE2() reports SSE2 support, otherwise
    NULL.
*/
SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
    if (!cachedHasSSE2()) {
        return NULL;
    }
    return ColorRect32_SSE2;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче