From d774b3ffc7ca610a649bd36b3d47e2452c51da0e Mon Sep 17 00:00:00 2001 From: Andrew Osmond Date: Tue, 24 Sep 2019 13:31:26 +0000 Subject: [PATCH] Bug 1551088 - Part 3. Add SSSE3 and AVX2 implementations of unpacking. r=lsalzman These variants perform significantly faster than the C implementations according to local testing and that in treeherder. Image decoding is as much as 40% faster in the most simple cases (solid green PNG image). Differential Revision: https://phabricator.services.mozilla.com/D46446 --HG-- extra : moz-landing-system : lando --- gfx/2d/Swizzle.cpp | 32 ++++++++++++++++ gfx/2d/SwizzleAVX2.cpp | 84 +++++++++++++++++++++++++++++++++++++++++ gfx/2d/SwizzleSSSE3.cpp | 66 ++++++++++++++++++++++++++++++++ gfx/2d/moz.build | 4 ++ 4 files changed, 186 insertions(+) create mode 100644 gfx/2d/SwizzleAVX2.cpp create mode 100644 gfx/2d/SwizzleSSSE3.cpp diff --git a/gfx/2d/Swizzle.cpp b/gfx/2d/Swizzle.cpp index 2804b8a0d666..8898ebebf651 100644 --- a/gfx/2d/Swizzle.cpp +++ b/gfx/2d/Swizzle.cpp @@ -151,6 +151,20 @@ void SwizzleRow_SSE2(const uint8_t*, uint8_t*, int32_t); SwizzleRow_SSE2) +template +void UnpackRowRGB24_SSSE3(const uint8_t*, uint8_t*, int32_t); + +#define UNPACK_ROW_RGB_SSSE3(aDstFormat) \ + FORMAT_CASE_ROW(SurfaceFormat::R8G8B8, aDstFormat, \ + UnpackRowRGB24_SSSE3) + +template +void UnpackRowRGB24_AVX2(const uint8_t*, uint8_t*, int32_t); + +#define UNPACK_ROW_RGB_AVX2(aDstFormat) \ + FORMAT_CASE_ROW(SurfaceFormat::R8G8B8, aDstFormat, \ + UnpackRowRGB24_AVX2) + #endif #ifdef USE_NEON @@ -991,6 +1005,24 @@ bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride, SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) { #ifdef USE_SSE2 + if (mozilla::supports_avx2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) { + UNPACK_ROW_RGB_AVX2(SurfaceFormat::R8G8B8X8) + UNPACK_ROW_RGB_AVX2(SurfaceFormat::R8G8B8A8) + UNPACK_ROW_RGB_AVX2(SurfaceFormat::B8G8R8X8) + UNPACK_ROW_RGB_AVX2(SurfaceFormat::B8G8R8A8) + default: + break; + } + + if (mozilla::supports_ssse3()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) { + UNPACK_ROW_RGB_SSSE3(SurfaceFormat::R8G8B8X8) + UNPACK_ROW_RGB_SSSE3(SurfaceFormat::R8G8B8A8) + UNPACK_ROW_RGB_SSSE3(SurfaceFormat::B8G8R8X8) + UNPACK_ROW_RGB_SSSE3(SurfaceFormat::B8G8R8A8) + default: + break; + } + if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) { SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8) SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8) diff --git a/gfx/2d/SwizzleAVX2.cpp b/gfx/2d/SwizzleAVX2.cpp new file mode 100644 index 000000000000..ff334e31dca4 --- /dev/null +++ b/gfx/2d/SwizzleAVX2.cpp @@ -0,0 +1,84 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "Swizzle.h" + +#include +#include + +namespace mozilla { +namespace gfx { + +template +void UnpackRowRGB24_SSSE3(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength); + +template +void UnpackRowRGB24_AVX2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) { + // Because this implementation will read an additional 8 bytes of data that + // is ignored and masked over, we cannot use the accelerated version for the + // last 1-8 bytes to guarantee we don't access memory outside the buffer. + if (aLength < 9) { + UnpackRowRGB24_SSSE3(aSrc, aDst, aLength); + return; + } + + // Because we are expanding, we can only process the data back to front in + // case we are performing this in place. + int32_t alignedRow = (aLength - 1) & ~7; + int32_t remainder = aLength - alignedRow; + + const uint8_t* src = aSrc + alignedRow * 3; + uint8_t* dst = aDst + alignedRow * 4; + + // Handle any 1-8 remaining pixels. + UnpackRowRGB24_SSSE3(src, dst, remainder); + + // Used to shuffle the two final 32-bit words which we ignore into the last + // 32-bit word of each 128-bit lane, such that + // RGBR GBRG BRGB RGBR GBRG BRGB RGBR GBRG + // BRGB RGBR GBRG BRGB ZZZZ ZZZZ ZZZZ ZZZZ + // becomes + // RGBR GBRG BRGB RGBR GBRG BRGB ZZZZ ZZZZ + // RGBR GBRG BRGB RGBR GBRG BRGB ZZZZ ZZZZ + const __m256i discardMask = _mm256_set_epi32(7, 5, 4, 3, 6, 2, 1, 0); + + // Used to shuffle 8-bit words within a 128-bit lane, such that we transform + // RGBR GBRG BRGB RGBR GBRG BRGB ZZZZ ZZZZ + // into + // RGBZ RGBZ RGBZ RGBZ RGBZ RGBZ RGBZ RGBZ + // or + // BGRZ BGRZ BGRZ BGRZ BGRZ BGRZ BGRZ BGRZ + const __m256i colorMask = + aSwapRB ? _mm256_set_epi8(15, 9, 10, 11, 14, 6, 7, 8, 13, 3, 4, 5, 12, 0, + 1, 2, 15, 9, 10, 11, 14, 6, 7, 8, 13, 3, 4, 5, + 12, 0, 1, 2) + : _mm256_set_epi8(15, 11, 10, 9, 14, 8, 7, 6, 13, 5, 4, 3, 12, 2, + 1, 0, 15, 11, 10, 9, 14, 8, 7, 6, 13, 5, 4, 3, + 12, 2, 1, 0); + + // Used to transform RGBZ/BGRZ to RGBX/BGRX, or force the alpha opaque. + const __m256i alphaMask = _mm256_set1_epi32(0xFF000000); + + // Process all 8-pixel chunks as one vector. + src -= 8 * 3; + dst -= 8 * 4; + while (src >= aSrc) { + __m256i px = _mm256_loadu_si256(reinterpret_cast(src)); + px = _mm256_permutevar8x32_epi32(px, discardMask); + px = _mm256_shuffle_epi8(px, colorMask); + px = _mm256_or_si256(px, alphaMask); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), px); + src -= 8 * 3; + dst -= 8 * 4; + } +} + +// Force instantiation of swizzle variants here. +template void UnpackRowRGB24_AVX2(const uint8_t*, uint8_t*, int32_t); +template void UnpackRowRGB24_AVX2(const uint8_t*, uint8_t*, int32_t); + +} // namespace gfx +} // namespace mozilla diff --git a/gfx/2d/SwizzleSSSE3.cpp b/gfx/2d/SwizzleSSSE3.cpp new file mode 100644 index 000000000000..13cfcb83926f --- /dev/null +++ b/gfx/2d/SwizzleSSSE3.cpp @@ -0,0 +1,66 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "Swizzle.h" + +#include +#include + +namespace mozilla { +namespace gfx { + +template +void UnpackRowRGB24(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength); + +template +void UnpackRowRGB24_SSSE3(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) { + // Because this implementation will read an additional 4 bytes of data that + // is ignored and masked over, we cannot use the accelerated version for the + // last 1-4 bytes to guarantee we don't access memory outside the buffer. + if (aLength < 5) { + UnpackRowRGB24(aSrc, aDst, aLength); + return; + } + + // Because we are expanding, we can only process the data back to front in + // case we are performing this in place. + int32_t alignedRow = (aLength - 1) & ~3; + int32_t remainder = aLength - alignedRow; + + const uint8_t* src = aSrc + alignedRow * 3; + uint8_t* dst = aDst + alignedRow * 4; + + // Handle 1-4 remaining pixels. + UnpackRowRGB24(src, dst, remainder); + + __m128i mask; + if (aSwapRB) { + mask = _mm_set_epi8(15, 9, 10, 11, 14, 6, 7, 8, 13, 3, 4, 5, 12, 0, 1, 2); + } else { + mask = _mm_set_epi8(15, 11, 10, 9, 14, 8, 7, 6, 13, 5, 4, 3, 12, 2, 1, 0); + } + + __m128i alpha = _mm_set1_epi32(0xFF000000); + + // Process all 4-pixel chunks as one vector. + src -= 4 * 3; + dst -= 4 * 4; + while (src >= aSrc) { + __m128i px = _mm_loadu_si128(reinterpret_cast(src)); + px = _mm_shuffle_epi8(px, mask); + px = _mm_or_si128(px, alpha); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), px); + src -= 4 * 3; + dst -= 4 * 4; + } +} + +// Force instantiation of swizzle variants here. +template void UnpackRowRGB24_SSSE3(const uint8_t*, uint8_t*, int32_t); +template void UnpackRowRGB24_SSSE3(const uint8_t*, uint8_t*, int32_t); + +} // namespace gfx +} // namespace mozilla diff --git a/gfx/2d/moz.build b/gfx/2d/moz.build index b9e6e3a7a6a8..627cb6172ee6 100644 --- a/gfx/2d/moz.build +++ b/gfx/2d/moz.build @@ -145,7 +145,9 @@ if CONFIG['INTEL_ARCHITECTURE']: 'FilterProcessingSSE2.cpp', 'ImageScalingSSE2.cpp', 'ssse3-scaler.c', + 'SwizzleAVX2.cpp', 'SwizzleSSE2.cpp', + 'SwizzleSSSE3.cpp', ] DEFINES['USE_SSE2'] = True # The file uses SSE2 intrinsics, so it needs special compile flags on some @@ -153,7 +155,9 @@ if CONFIG['INTEL_ARCHITECTURE']: SOURCES['BlurSSE2.cpp'].flags += CONFIG['SSE2_FLAGS'] SOURCES['FilterProcessingSSE2.cpp'].flags += CONFIG['SSE2_FLAGS'] SOURCES['ImageScalingSSE2.cpp'].flags += CONFIG['SSE2_FLAGS'] + SOURCES['SwizzleAVX2.cpp'].flags += ['-mavx2'] SOURCES['SwizzleSSE2.cpp'].flags += CONFIG['SSE2_FLAGS'] + SOURCES['SwizzleSSSE3.cpp'].flags += CONFIG['SSSE3_FLAGS'] SOURCES['ssse3-scaler.c'].flags += CONFIG['SSSE3_FLAGS'] elif CONFIG['CPU_ARCH'].startswith('mips'): SOURCES += [