/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Swizzle.h"

#include <arm_neon.h>

namespace mozilla {
namespace gfx {

// Load 1-3 pixels into a 4 pixel vector.
static MOZ_ALWAYS_INLINE uint16x8_t
LoadRemainder_NEON(const uint8_t* aSrc, size_t aLength)
{
  uint16x8_t px;
  if (aLength >= 2) {
    // Load first 2 pixels
    px =
      vreinterpretq_u16_u64(
        vld1q_lane_u64(reinterpret_cast<const uint64_t*>(aSrc),
                       vdupq_n_u64(0), 0));
    // Load third pixel
    if (aLength >= 3) {
      px =
        vreinterpretq_u16_u32(
          vld1q_lane_u32(reinterpret_cast<const uint32_t*>(aSrc + 2 * 4),
                         vreinterpretq_u32_u16(px), 2));
    }
  } else {
    // Load single pixel
    px =
      vreinterpretq_u16_u32(
        vld1q_lane_u32(reinterpret_cast<const uint32_t*>(aSrc),
                       vdupq_n_u32(0), 0));
  }
  return px;
}

// Store 1-3 pixels from a vector into memory without overwriting.
static MOZ_ALWAYS_INLINE void
StoreRemainder_NEON(uint8_t* aDst, size_t aLength, const uint16x8_t& aSrc)
{
  if (aLength >= 2) {
    // Store first 2 pixels
    vst1q_lane_u64(reinterpret_cast<uint64_t*>(aDst),
                   vreinterpretq_u64_u16(aSrc), 0);
    // Store third pixel
    if (aLength >= 3) {
      vst1q_lane_u32(reinterpret_cast<uint32_t*>(aDst + 2 * 4),
                     vreinterpretq_u32_u16(aSrc), 2);
    }
  } else {
    // Store single pixel
    vst1q_lane_u32(reinterpret_cast<uint32_t*>(aDst),
                   vreinterpretq_u32_u16(aSrc), 0);
  }
}

// Premultiply vector of 4 pixels using splayed math.
template<bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE uint16x8_t
PremultiplyVector_NEON(const uint16x8_t& aSrc)
{
  // Isolate R and B with mask.
  const uint16x8_t mask = vdupq_n_u16(0x00FF);
  uint16x8_t rb = vandq_u16(aSrc, mask);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = vrev32q_u16(rb);
  }
  // Isolate G and A by shifting down to bottom of word.
  uint16x8_t ga = vshrq_n_u16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  uint16x8_t alphas = vtrnq_u16(ga, ga).val[1];

  // rb = rb*a + 255; rb += rb >> 8;
  rb = vmlaq_u16(mask, rb, alphas);
  rb = vsraq_n_u16(rb, rb, 8);

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0x00FF0000)));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = vmlaq_u16(mask, ga, alphas);
  ga = vsraq_n_u16(ga, ga, 8);
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  return vsriq_n_u16(ga, rb, 8);
}

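// For reference, a scalar sketch (our illustration; the helper name is
// hypothetical, not part of the Swizzle API) of the approximate divide-by-255
// that the vector math above performs per component:
//   result = (c*a + 255 + ((c*a + 255) >> 8)) >> 8  ~=  c * a / 255
// The +255 bias comes from reusing the 0x00FF mask and rounds upward.
static MOZ_ALWAYS_INLINE uint32_t
ScalarPremultiplySketch(uint32_t aComponent, uint32_t aAlpha)
{
  uint32_t x = aComponent * aAlpha + 255;  // vmlaq_u16 with the 0x00FF mask
  x += x >> 8;                             // vsraq_n_u16(x, x, 8)
  return x >> 8;                           // high byte taken by vsriq_n_u16
}
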
template<bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap,
                      uint8_t* aDst, int32_t aDstGap,
                      IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;
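  // Illustrative example (ours, not from the original comments): with
  // width == 10, the vector loop covers alignedRow == 32 bytes (8 pixels)
  // and remainder == 2. The remainder pixels are handled below without
  // advancing aSrc/aDst, so adding 4 * remainder to each gap steps over
  // them together with the stride padding at the end of the row.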

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
      px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
      vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
      px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_NEON(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of premultiply variants here.
template void Premultiply_NEON<false, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_NEON<false, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_NEON<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_NEON<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// This generates a table of fixed-point reciprocals representing 1/alpha
// similar to the fallback implementation. However, the reciprocal must
// ultimately be multiplied as an unsigned 9 bit upper part and a signed
// 15 bit lower part to cheaply multiply. Thus, the lower 15 bits of the
// reciprocal are masked off and stored in the low word. The upper 9 bits
// are masked and shifted to fit into the high word. These then get
// independently multiplied with the color component and recombined to
// provide the full reciprocal multiply.
#define UNPREMULQ_NEON(x) ((((0xFF00FFU / (x)) & 0xFF8000U) << 1) | ((0xFF00FFU / (x)) & 0x7FFFU))
#define UNPREMULQ_NEON_2(x) UNPREMULQ_NEON(x), UNPREMULQ_NEON((x) + 1)
#define UNPREMULQ_NEON_4(x) UNPREMULQ_NEON_2(x), UNPREMULQ_NEON_2((x) + 2)
#define UNPREMULQ_NEON_8(x) UNPREMULQ_NEON_4(x), UNPREMULQ_NEON_4((x) + 4)
#define UNPREMULQ_NEON_16(x) UNPREMULQ_NEON_8(x), UNPREMULQ_NEON_8((x) + 8)
#define UNPREMULQ_NEON_32(x) UNPREMULQ_NEON_16(x), UNPREMULQ_NEON_16((x) + 16)
static const uint32_t sUnpremultiplyTable_NEON[256] =
{
  0, UNPREMULQ_NEON(1), UNPREMULQ_NEON_2(2), UNPREMULQ_NEON_4(4),
  UNPREMULQ_NEON_8(8), UNPREMULQ_NEON_16(16), UNPREMULQ_NEON_32(32),
  UNPREMULQ_NEON_32(64), UNPREMULQ_NEON_32(96), UNPREMULQ_NEON_32(128),
  UNPREMULQ_NEON_32(160), UNPREMULQ_NEON_32(192), UNPREMULQ_NEON_32(224)
};

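// For reference, a scalar sketch (our illustration; the helper name is
// hypothetical) of how one table entry is consumed. With r = 0xFF00FF / alpha,
// the low word of an entry holds lo = r & 0x7FFF and the high word holds
// hi = r >> 15, so r = (hi << 15) | lo and
//   c * 255 / alpha  ~=  (c*r) >> 16  ==  (c*hi + ((c*lo*2) >> 16)) >> 1
static MOZ_ALWAYS_INLINE uint32_t
ScalarUnpremultiplySketch(uint32_t aComponent, uint32_t aAlpha)
{
  uint32_t q = sUnpremultiplyTable_NEON[aAlpha];
  uint32_t lo = q & 0x7FFFU;  // signed-safe low 15 bits, as fed to VQDMULH
  uint32_t hi = q >> 16;      // upper 9 bits, pre-shifted by storage
  // VMUL by hi and VQDMULH by lo both produce doubled results; VHADD halves.
  return (aComponent * hi + ((aComponent * lo * 2) >> 16)) >> 1;
}
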
// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
template<bool aSwapRB>
static MOZ_ALWAYS_INLINE uint16x8_t
UnpremultiplyVector_NEON(const uint16x8_t& aSrc)
{
  // Isolate R and B with mask.
  uint16x8_t rb = vandq_u16(aSrc, vdupq_n_u16(0x00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = vrev32q_u16(rb);
  }

  // Isolate G and A by shifting down to bottom of word.
  uint16x8_t ga = vshrq_n_u16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = vgetq_lane_u16(ga, 1);
  int a2 = vgetq_lane_u16(ga, 3);
  int a3 = vgetq_lane_u16(ga, 5);
  int a4 = vgetq_lane_u16(ga, 7);

  // First load all of the interleaved low and high portions of the reciprocals
  // and combine them into a single vector as lo1 hi1 lo2 hi2 lo3 hi3 lo4 hi4
  uint16x8_t q1234 =
    vreinterpretq_u16_u32(
      vld1q_lane_u32(&sUnpremultiplyTable_NEON[a4],
        vld1q_lane_u32(&sUnpremultiplyTable_NEON[a3],
          vld1q_lane_u32(&sUnpremultiplyTable_NEON[a2],
            vld1q_lane_u32(&sUnpremultiplyTable_NEON[a1],
                           vdupq_n_u32(0), 0), 1), 2), 3));
  // Transpose the interleaved low/high portions so that we produce
  // two separate duplicated vectors for the low and high portions respectively:
  // lo1 lo1 lo2 lo2 lo3 lo3 lo4 lo4 and hi1 hi1 hi2 hi2 hi3 hi3 hi4 hi4
  uint16x8x2_t q1234lohi = vtrnq_u16(q1234, q1234);

  // VQDMULH is a signed multiply that doubles (*2) the result, then takes the
  // high word. To work around the signedness and the doubling, the low portion
  // of the reciprocal only stores the lower 15 bits, which fits in a signed
  // 16 bit integer. The high 9 bit portion is effectively also doubled by 2 as
  // a side-effect of being shifted for storage. Thus the output scale of doing
  // a normal multiply by the high portion and the VQDMULH by the low portion
  // are both doubled and can be safely added together. The resulting sum just
  // needs to be halved (via VHADD) to thus cancel out the doubling. All this
  // combines to produce a reciprocal multiply of the form:
  // rb = ((rb * hi) + ((rb * lo * 2) >> 16)) / 2
  rb =
    vhaddq_u16(
      vmulq_u16(rb, q1234lohi.val[1]),
      vreinterpretq_u16_s16(
        vqdmulhq_s16(vreinterpretq_s16_u16(rb),
                     vreinterpretq_s16_u16(q1234lohi.val[0]))));

  // ga = ((ga * hi) + ((ga * lo * 2) >> 16)) / 2
  ga =
    vhaddq_u16(
      vmulq_u16(ga, q1234lohi.val[1]),
      vreinterpretq_u16_s16(
        vqdmulhq_s16(vreinterpretq_s16_u16(ga),
                     vreinterpretq_s16_u16(q1234lohi.val[0]))));

  // Combine to the final pixel with ((rb | (ga << 8)) & ~0xFF000000) | (aSrc & 0xFF000000),
  // which inserts back in the original alpha value unchanged.
  return vbslq_u16(vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)),
                   aSrc,
                   vsliq_n_u16(rb, ga, 8));
}

template<bool aSwapRB>
void Unpremultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap,
                        uint8_t* aDst, int32_t aDstGap,
                        IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
      px = UnpremultiplyVector_NEON<aSwapRB>(px);
      vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
      px = UnpremultiplyVector_NEON<aSwapRB>(px);
      StoreRemainder_NEON(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of unpremultiply variants here.
template void Unpremultiply_NEON<false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// Swizzle a vector of 4 pixels providing swaps and opaquifying.
template<bool aSwapRB, bool aOpaqueAlpha>
MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON(const uint16x8_t& aSrc)
{
  // Swap R and B, then add to G and A (forced to 255 if opaque):
  // (((src >> 16) | (src << 16)) & 0x00FF00FF) |
  //   ((src | 0xFF000000) & ~0x00FF00FF)
  return vbslq_u16(vdupq_n_u16(0x00FF),
                   vrev32q_u16(aSrc),
                   aOpaqueAlpha
                     ? vorrq_u16(aSrc,
                                 vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)))
                     : aSrc);
}

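// For reference, a scalar sketch (our illustration, with a hypothetical helper
// name) of the formula in the comment above applied to one 32-bit pixel:
static MOZ_ALWAYS_INLINE uint32_t
ScalarSwizzleSketch(uint32_t aSrc, bool aOpaqueAlpha)
{
  // Swapped R and B land in the byte lanes selected by 0x00FF00FF.
  uint32_t rb = ((aSrc >> 16) | (aSrc << 16)) & 0x00FF00FFU;
  // G stays in place; A is optionally forced to 255.
  uint32_t ga = (aOpaqueAlpha ? (aSrc | 0xFF000000U) : aSrc) & ~0x00FF00FFU;
  return rb | ga;
}
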
#if 0
// These specializations currently do not profile faster than the generic
// versions, so disable them for now.

// Optimized implementations for when there is no R and B swap.
template<>
MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
{
  // Force alpha to 255.
  return vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
}

template<>
MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
{
  return aSrc;
}
#endif

template<bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap,
                  uint8_t* aDst, int32_t aDstGap,
                  IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
      px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
      vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
      px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_NEON(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of swizzle variants here.
template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

} // namespace gfx
} // namespace mozilla