зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1816576 - Update libyuv to 2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2 r=ng
Differential Revision: https://phabricator.services.mozilla.com/D185940
This commit is contained in:
Родитель
1860dd40f3
Коммит
436a11e04d
|
@ -1,6 +1,6 @@
|
|||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1857
|
||||
Version: 1860
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
|
|
@ -232,6 +232,27 @@ void TransposeWx1_16_C(const uint16_t* src,
|
|||
uint16_t* dst,
|
||||
int dst_stride,
|
||||
int width);
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
void Transpose4x4_32_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width);
|
||||
|
||||
void Transpose4x4_32_C(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width);
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
void Transpose8x8_32_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
|
|
@ -177,9 +177,8 @@ extern "C" {
|
|||
|
||||
// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
|
||||
// caveat: clangcl uses row_win.cc which works.
|
||||
#if !defined(MOZ_PROFILING) && \
|
||||
(defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
|
||||
defined(_MSC_VER))
|
||||
#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
|
||||
defined(_MSC_VER)
|
||||
// TODO(fbarchard): fix build error on android_full_debug=1
|
||||
// https://code.google.com/p/libyuv/issues/detail?id=517
|
||||
#define HAS_I422ALPHATOARGBROW_SSSE3
|
||||
|
@ -248,9 +247,8 @@ extern "C" {
|
|||
#define HAS_ARGBATTENUATEROW_AVX2
|
||||
#endif
|
||||
|
||||
#if !defined(MOZ_PROFILING) && \
|
||||
(defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
|
||||
defined(_MSC_VER))
|
||||
#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
|
||||
defined(_MSC_VER)
|
||||
// TODO(fbarchard): fix build error on android_full_debug=1
|
||||
// https://code.google.com/p/libyuv/issues/detail?id=517
|
||||
#define HAS_I422ALPHATOARGBROW_AVX2
|
||||
|
@ -404,9 +402,11 @@ extern "C" {
|
|||
// The following are available for AVX512 clang x86 platforms:
|
||||
// TODO(fbarchard): Port to GCC and Visual C
|
||||
// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
|
||||
// TODO(fbarchard): Port MERGEUV to assembly
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512))
|
||||
(defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512) && !defined(_MSC_VER))
|
||||
#define HAS_ARGBTORGB24ROW_AVX512VBMI
|
||||
#define HAS_MERGEUVROW_AVX512BW
|
||||
#endif
|
||||
|
||||
// The following are available for AVX512 clang x64 platforms:
|
||||
|
@ -2186,6 +2186,10 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
|
|||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
int width);
|
||||
void MergeUVRow_AVX512BW(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
int width);
|
||||
void MergeUVRow_NEON(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
|
@ -2206,6 +2210,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
|
|||
const uint8_t* uv_buf,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf,
|
||||
const uint8_t* uv_buf,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
void MergeUVRow_Any_NEON(const uint8_t* y_buf,
|
||||
const uint8_t* uv_buf,
|
||||
uint8_t* dst_ptr,
|
||||
|
|
|
@ -11,6 +11,6 @@
|
|||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1857
|
||||
#define LIBYUV_VERSION 1860
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
|
|
@ -52,7 +52,7 @@
|
|||
'optimize': 'max', # enable O2 and ltcg.
|
||||
},
|
||||
# Allows libyuv.a redistributable library without external dependencies.
|
||||
# 'standalone_static_library': 1,
|
||||
'standalone_static_library': 1,
|
||||
'conditions': [
|
||||
# Disable -Wunused-parameter
|
||||
['clang == 1', {
|
||||
|
@ -70,11 +70,6 @@
|
|||
'-mfpu=vfpv3-d16',
|
||||
# '-mthumb', # arm32 not thumb
|
||||
],
|
||||
'cflags_mozilla!': [
|
||||
'-mfpu=vfp',
|
||||
'-mfpu=vfpv3',
|
||||
'-mfpu=vfpv3-d16',
|
||||
],
|
||||
'conditions': [
|
||||
# Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
|
||||
['clang == 0 and use_lto == 1', {
|
||||
|
@ -89,9 +84,6 @@
|
|||
'-mfpu=neon',
|
||||
# '-marm', # arm32 not thumb
|
||||
],
|
||||
'cflags_mozilla': [
|
||||
'-mfpu=neon',
|
||||
],
|
||||
}],
|
||||
],
|
||||
}],
|
||||
|
@ -100,15 +92,7 @@
|
|||
'LIBYUV_MSA',
|
||||
],
|
||||
}],
|
||||
['build_with_mozilla == 1', {
|
||||
'defines': [
|
||||
'HAVE_JPEG'
|
||||
],
|
||||
'cflags_mozilla': [
|
||||
'$(MOZ_JPEG_CFLAGS)',
|
||||
],
|
||||
}],
|
||||
['OS != "ios" and libyuv_disable_jpeg != 1 and build_with_mozilla != 1', {
|
||||
['OS != "ios" and libyuv_disable_jpeg != 1', {
|
||||
'defines': [
|
||||
'HAVE_JPEG'
|
||||
],
|
||||
|
|
|
@ -924,6 +924,14 @@ int I422ToNV21(const uint8_t* src_y,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
MergeUVRow = MergeUVRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(halfwidth, 32)) {
|
||||
MergeUVRow = MergeUVRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MergeUVRow = MergeUVRow_Any_NEON;
|
||||
|
|
|
@ -389,6 +389,14 @@ int ARGBToNV12(const uint8_t* src_argb,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(halfwidth, 32)) {
|
||||
MergeUVRow_ = MergeUVRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_NEON;
|
||||
|
@ -559,6 +567,14 @@ int ARGBToNV21(const uint8_t* src_argb,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(halfwidth, 64)) {
|
||||
MergeUVRow_ = MergeUVRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_NEON;
|
||||
|
@ -726,6 +742,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(halfwidth, 64)) {
|
||||
MergeUVRow_ = MergeUVRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_NEON;
|
||||
|
@ -894,6 +918,14 @@ int ABGRToNV21(const uint8_t* src_abgr,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(halfwidth, 64)) {
|
||||
MergeUVRow_ = MergeUVRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_NEON;
|
||||
|
@ -2921,6 +2953,14 @@ int RAWToJNV21(const uint8_t* src_raw,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(halfwidth, 64)) {
|
||||
MergeUVRow_ = MergeUVRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_NEON;
|
||||
|
|
|
@ -79,9 +79,7 @@ MJpegDecoder::MJpegDecoder()
|
|||
decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
|
||||
// Override standard exit()-based error handler.
|
||||
error_mgr_->base.error_exit = &ErrorHandler;
|
||||
#ifndef DEBUG_MJPEG
|
||||
error_mgr_->base.output_message = &OutputHandler;
|
||||
#endif
|
||||
#endif
|
||||
decompress_struct_->client_data = NULL;
|
||||
source_mgr_->init_source = &init_source;
|
||||
|
@ -465,12 +463,11 @@ void ErrorHandler(j_common_ptr cinfo) {
|
|||
longjmp(mgr->setjmp_buffer, 1);
|
||||
}
|
||||
|
||||
#ifndef DEBUG_MJPEG
|
||||
// Suppress fprintf warnings.
|
||||
void OutputHandler(j_common_ptr cinfo) {
|
||||
(void)cinfo;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // HAVE_SETJMP
|
||||
|
||||
void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
|
||||
|
|
|
@ -599,6 +599,14 @@ void MergeUVPlane(const uint8_t* src_u,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
MergeUVRow = MergeUVRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
MergeUVRow = MergeUVRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MergeUVRow = MergeUVRow_Any_NEON;
|
||||
|
|
|
@ -166,6 +166,63 @@ void TransposeWxH_16_C(const uint16_t* src,
|
|||
}
|
||||
}
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
void Transpose4x4_32_C(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
const uint8_t* src1 = src + src_stride;
|
||||
const uint8_t* src2 = src1 + src_stride;
|
||||
const uint8_t* src3 = src2 + src_stride;
|
||||
uint8_t* dst1 = dst + dst_stride;
|
||||
uint8_t* dst2 = dst1 + dst_stride;
|
||||
uint8_t* dst3 = dst2 + dst_stride;
|
||||
int i;
|
||||
for (i = 0; i < width; i += 4) {
|
||||
uint32_t p00 = ((uint32_t*)(src))[0];
|
||||
uint32_t p10 = ((uint32_t*)(src))[1];
|
||||
uint32_t p20 = ((uint32_t*)(src))[2];
|
||||
uint32_t p30 = ((uint32_t*)(src))[3];
|
||||
uint32_t p01 = ((uint32_t*)(src1))[0];
|
||||
uint32_t p11 = ((uint32_t*)(src1))[1];
|
||||
uint32_t p21 = ((uint32_t*)(src1))[2];
|
||||
uint32_t p31 = ((uint32_t*)(src1))[3];
|
||||
uint32_t p02 = ((uint32_t*)(src2))[0];
|
||||
uint32_t p12 = ((uint32_t*)(src2))[1];
|
||||
uint32_t p22 = ((uint32_t*)(src2))[2];
|
||||
uint32_t p32 = ((uint32_t*)(src2))[3];
|
||||
uint32_t p03 = ((uint32_t*)(src3))[0];
|
||||
uint32_t p13 = ((uint32_t*)(src3))[1];
|
||||
uint32_t p23 = ((uint32_t*)(src3))[2];
|
||||
uint32_t p33 = ((uint32_t*)(src3))[3];
|
||||
((uint32_t*)(dst))[0] = p00;
|
||||
((uint32_t*)(dst))[1] = p01;
|
||||
((uint32_t*)(dst))[2] = p02;
|
||||
((uint32_t*)(dst))[3] = p03;
|
||||
((uint32_t*)(dst1))[0] = p10;
|
||||
((uint32_t*)(dst1))[1] = p11;
|
||||
((uint32_t*)(dst1))[2] = p12;
|
||||
((uint32_t*)(dst1))[3] = p13;
|
||||
((uint32_t*)(dst2))[0] = p20;
|
||||
((uint32_t*)(dst2))[1] = p21;
|
||||
((uint32_t*)(dst2))[2] = p22;
|
||||
((uint32_t*)(dst2))[3] = p23;
|
||||
((uint32_t*)(dst3))[0] = p30;
|
||||
((uint32_t*)(dst3))[1] = p31;
|
||||
((uint32_t*)(dst3))[2] = p32;
|
||||
((uint32_t*)(dst3))[3] = p33;
|
||||
src += src_stride * 4; // advance 4 rows
|
||||
src1 += src_stride * 4;
|
||||
src2 += src_stride * 4;
|
||||
src3 += src_stride * 4;
|
||||
dst += 4 * 4; // advance 4 columns
|
||||
dst1 += 4 * 4;
|
||||
dst2 += 4 * 4;
|
||||
dst3 += 4 * 4;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
|
|
@ -435,6 +435,45 @@ void TransposeUVWx8_NEON(const uint8_t* src,
|
|||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
|
||||
}
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
void Transpose4x4_32_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
const uint8_t* src1 = src + src_stride;
|
||||
const uint8_t* src2 = src1 + src_stride;
|
||||
const uint8_t* src3 = src2 + src_stride;
|
||||
uint8_t* dst1 = dst + dst_stride;
|
||||
uint8_t* dst2 = dst1 + dst_stride;
|
||||
uint8_t* dst3 = dst2 + dst_stride;
|
||||
asm volatile(
|
||||
// Main loop transpose 4x4. Read a column, write a row.
|
||||
"1: \n"
|
||||
"ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
|
||||
"ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
|
||||
"ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"
|
||||
"ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n"
|
||||
"subs %w8, %w8, #4 \n" // w -= 4
|
||||
"st1 {v0.4s}, [%4], 16 \n"
|
||||
"st1 {v1.4s}, [%5], 16 \n"
|
||||
"st1 {v2.4s}, [%6], 16 \n"
|
||||
"st1 {v3.4s}, [%7], 16 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(src1), // %1
|
||||
"+r"(src2), // %2
|
||||
"+r"(src3), // %3
|
||||
"+r"(dst), // %4
|
||||
"+r"(dst1), // %5
|
||||
"+r"(dst2), // %6
|
||||
"+r"(dst3), // %7
|
||||
"+r"(width) // %8
|
||||
: "r"((ptrdiff_t)(src_stride * 4)) // %9
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -571,6 +571,9 @@ ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
|
|||
#ifdef HAS_MERGEUVROW_AVX2
|
||||
ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
|
||||
#endif
|
||||
#ifdef HAS_MERGEUVROW_AVX512BW
|
||||
ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
|
||||
#endif
|
||||
#ifdef HAS_MERGEUVROW_NEON
|
||||
ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
|
||||
#endif
|
||||
|
|
|
@ -17,6 +17,8 @@ extern "C" {
|
|||
// This module is for GCC x86 and x64.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
|
||||
|
||||
// Constants for ARGB
|
||||
|
@ -5142,6 +5144,25 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
|
|||
}
|
||||
#endif // HAS_DETILESPLITUVROW_SSSE3
|
||||
|
||||
#ifdef HAS_MERGEUVROW_AVX512BW
|
||||
__attribute__ ((target("avx512vl,avx512bw")))
|
||||
void MergeUVRow_AVX512BW(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
do {
|
||||
const __m512i u = _mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_u));
|
||||
const __m512i v = _mm512_slli_epi64(_mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_v)), 8);
|
||||
const __m512i uv = _mm512_or_si512(u, v);
|
||||
_mm512_storeu_epi8(dst_uv, uv);
|
||||
src_u += 32;
|
||||
src_v += 32;
|
||||
dst_uv += 64;
|
||||
width -= 32;
|
||||
} while (width > 0);
|
||||
}
|
||||
#endif // HAS_MERGEUVROW_AVX512BW
|
||||
|
||||
#ifdef HAS_MERGEUVROW_AVX2
|
||||
void MergeUVRow_AVX2(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
|
|
|
@ -2036,8 +2036,8 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
|
|||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
asm volatile(
|
||||
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
asm volatile(
|
||||
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
|
||||
"xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
|
||||
"xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
|
||||
|
@ -2047,19 +2047,21 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
|
|||
"xvld $xr4, %0, 0 \n\t"
|
||||
"xvld $xr5, %0, 32 \n\t"
|
||||
"xvld $xr6, %0, 64 \n\t"
|
||||
"xvld $xr7, %0, 96 \n\t" // load 32 pixels of ARGB
|
||||
"xvld $xr7, %0, 96 \n\t" // load 32 pixels of
|
||||
// ARGB
|
||||
"xvor.v $xr12, $xr3, $xr3 \n\t"
|
||||
"xvor.v $xr13, $xr3, $xr3 \n\t"
|
||||
"addi.d %2, %2, -32 \n\t" // 32 processed per loop.
|
||||
"xvpickev.b $xr8, $xr5, $xr4 \n\t" //BR
|
||||
"addi.d %2, %2, -32 \n\t" // 32 processed per
|
||||
// loop.
|
||||
"xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR
|
||||
"xvpickev.b $xr10, $xr7, $xr6 \n\t"
|
||||
"xvpickod.b $xr9, $xr5, $xr4 \n\t" //GA
|
||||
"xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA
|
||||
"xvpickod.b $xr11, $xr7, $xr6 \n\t"
|
||||
"xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" //B
|
||||
"xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B
|
||||
"xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t"
|
||||
"xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" //G
|
||||
"xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G
|
||||
"xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t"
|
||||
"xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" //R
|
||||
"xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R
|
||||
"xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t"
|
||||
"addi.d %0, %0, 128 \n\t"
|
||||
"xvpickod.b $xr10, $xr13, $xr12 \n\t"
|
||||
|
@ -2067,13 +2069,11 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
|
|||
"xvst $xr11, %1, 0 \n\t"
|
||||
"addi.d %1, %1, 32 \n\t"
|
||||
"bnez %2, 1b \n\t"
|
||||
: "+&r"(src_argb), // %0
|
||||
"+&r"(dst_y), // %1
|
||||
"+&r"(width) // %2
|
||||
: "r"(rgbconstants),
|
||||
"r"(shuff)
|
||||
: "memory"
|
||||
);
|
||||
: "+&r"(src_argb), // %0
|
||||
"+&r"(dst_y), // %1
|
||||
"+&r"(width) // %2
|
||||
: "r"(rgbconstants), "r"(shuff)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
|
@ -2098,8 +2098,8 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
|
|||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
asm volatile(
|
||||
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
asm volatile(
|
||||
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
|
||||
"xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
|
||||
"xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
|
||||
|
@ -2109,19 +2109,21 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
|
|||
"xvld $xr4, %0, 0 \n\t"
|
||||
"xvld $xr5, %0, 32 \n\t"
|
||||
"xvld $xr6, %0, 64 \n\t"
|
||||
"xvld $xr7, %0, 96 \n\t" // load 32 pixels of RGBA
|
||||
"xvld $xr7, %0, 96 \n\t" // load 32 pixels of
|
||||
// RGBA
|
||||
"xvor.v $xr12, $xr3, $xr3 \n\t"
|
||||
"xvor.v $xr13, $xr3, $xr3 \n\t"
|
||||
"addi.d %2, %2, -32 \n\t" // 32 processed per loop.
|
||||
"xvpickev.b $xr8, $xr5, $xr4 \n\t" //AG
|
||||
"addi.d %2, %2, -32 \n\t" // 32 processed per
|
||||
// loop.
|
||||
"xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG
|
||||
"xvpickev.b $xr10, $xr7, $xr6 \n\t"
|
||||
"xvpickod.b $xr9, $xr5, $xr4 \n\t" //BR
|
||||
"xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR
|
||||
"xvpickod.b $xr11, $xr7, $xr6 \n\t"
|
||||
"xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" //B
|
||||
"xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B
|
||||
"xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t"
|
||||
"xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" //G
|
||||
"xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G
|
||||
"xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t"
|
||||
"xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" //R
|
||||
"xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R
|
||||
"xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t"
|
||||
"addi.d %0, %0, 128 \n\t"
|
||||
"xvpickod.b $xr10, $xr13, $xr12 \n\t"
|
||||
|
@ -2129,13 +2131,11 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
|
|||
"xvst $xr11, %1, 0 \n\t"
|
||||
"addi.d %1, %1, 32 \n\t"
|
||||
"bnez %2, 1b \n\t"
|
||||
: "+&r"(src_rgba), // %0
|
||||
"+&r"(dst_y), // %1
|
||||
"+&r"(width) // %2
|
||||
: "r"(rgbconstants),
|
||||
"r"(shuff)
|
||||
: "memory"
|
||||
);
|
||||
: "+&r"(src_rgba), // %0
|
||||
"+&r"(dst_y), // %1
|
||||
"+&r"(width) // %2
|
||||
: "r"(rgbconstants), "r"(shuff)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
|
@ -2151,18 +2151,19 @@ void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
|||
}
|
||||
|
||||
static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
int8_t shuff[128] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
|
||||
0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
|
||||
24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
|
||||
24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
|
||||
1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
|
||||
1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
|
||||
25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
|
||||
25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
|
||||
asm volatile(
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
int8_t shuff[128] = {
|
||||
0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
|
||||
0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
|
||||
24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
|
||||
24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
|
||||
1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
|
||||
1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
|
||||
25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
|
||||
25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
|
||||
asm volatile(
|
||||
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
|
||||
"xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
|
||||
"xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
|
||||
|
@ -2174,23 +2175,25 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
|
|||
"1: \n\t"
|
||||
"xvld $xr8, %0, 0 \n\t"
|
||||
"xvld $xr9, %0, 32 \n\t"
|
||||
"xvld $xr10, %0, 64 \n\t" // load 32 pixels of RGB
|
||||
"xvld $xr10, %0, 64 \n\t" // load 32 pixels of
|
||||
// RGB
|
||||
"xvor.v $xr12, $xr3, $xr3 \n\t"
|
||||
"xvor.v $xr13, $xr3, $xr3 \n\t"
|
||||
"xvor.v $xr11, $xr9, $xr9 \n\t"
|
||||
"addi.d %2, %2, -32 \n\t" // 32 processed per loop.
|
||||
"xvpermi.q $xr9, $xr8, 0x30 \n\t" //src0
|
||||
"xvpermi.q $xr8, $xr10, 0x03 \n\t" //src1
|
||||
"xvpermi.q $xr10, $xr11, 0x30 \n\t" //src2
|
||||
"addi.d %2, %2, -32 \n\t" // 32 processed per
|
||||
// loop.
|
||||
"xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0
|
||||
"xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1
|
||||
"xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2
|
||||
"xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t"
|
||||
"xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t"
|
||||
"xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t"
|
||||
"xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t"
|
||||
"xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" //G
|
||||
"xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G
|
||||
"xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t"
|
||||
"xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" //B
|
||||
"xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B
|
||||
"xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t"
|
||||
"xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" //R
|
||||
"xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R
|
||||
"xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t"
|
||||
"addi.d %0, %0, 96 \n\t"
|
||||
"xvpickod.b $xr10, $xr13, $xr12 \n\t"
|
||||
|
@ -2202,8 +2205,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
|
|||
"+&r"(width) // %2
|
||||
: "r"(rgbconstants), // %3
|
||||
"r"(shuff) // %4
|
||||
: "memory"
|
||||
);
|
||||
: "memory");
|
||||
}
|
||||
|
||||
void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||
|
|
|
@ -1679,7 +1679,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
|
|||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
asm volatile(
|
||||
asm volatile(
|
||||
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
|
||||
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
|
||||
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
|
||||
|
@ -1688,31 +1688,32 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
|
|||
"vld $vr4, %0, 0 \n\t"
|
||||
"vld $vr5, %0, 16 \n\t"
|
||||
"vld $vr6, %0, 32 \n\t"
|
||||
"vld $vr7, %0, 48 \n\t" // load 16 pixels of ARGB
|
||||
"vld $vr7, %0, 48 \n\t" // load 16 pixels of
|
||||
// ARGB
|
||||
"vor.v $vr12, $vr3, $vr3 \n\t"
|
||||
"vor.v $vr13, $vr3, $vr3 \n\t"
|
||||
"addi.d %2, %2, -16 \n\t" // 16 processed per loop.
|
||||
"vpickev.b $vr8, $vr5, $vr4 \n\t" //BR
|
||||
"addi.d %2, %2, -16 \n\t" // 16 processed per
|
||||
// loop.
|
||||
"vpickev.b $vr8, $vr5, $vr4 \n\t" // BR
|
||||
"vpickev.b $vr10, $vr7, $vr6 \n\t"
|
||||
"vpickod.b $vr9, $vr5, $vr4 \n\t" //GA
|
||||
"vpickod.b $vr9, $vr5, $vr4 \n\t" // GA
|
||||
"vpickod.b $vr11, $vr7, $vr6 \n\t"
|
||||
"vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" //B
|
||||
"vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" // B
|
||||
"vmaddwev.h.bu $vr13, $vr10, $vr0 \n\t"
|
||||
"vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" //G
|
||||
"vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" // G
|
||||
"vmaddwev.h.bu $vr13, $vr11, $vr1 \n\t"
|
||||
"vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" //R
|
||||
"vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" // R
|
||||
"vmaddwod.h.bu $vr13, $vr10, $vr2 \n\t"
|
||||
"addi.d %0, %0, 64 \n\t"
|
||||
"vpickod.b $vr10, $vr13, $vr12 \n\t"
|
||||
"vst $vr10, %1, 0 \n\t"
|
||||
"addi.d %1, %1, 16 \n\t"
|
||||
"bnez %2, 1b \n\t"
|
||||
: "+&r"(src_argb), // %0
|
||||
"+&r"(dst_y), // %1
|
||||
"+&r"(width) // %2
|
||||
: "+&r"(src_argb), // %0
|
||||
"+&r"(dst_y), // %1
|
||||
"+&r"(width) // %2
|
||||
: "r"(rgbconstants)
|
||||
: "memory"
|
||||
);
|
||||
: "memory");
|
||||
}
|
||||
|
||||
void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
|
@ -1737,7 +1738,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
|
|||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
asm volatile(
|
||||
asm volatile(
|
||||
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
|
||||
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
|
||||
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
|
||||
|
@ -1746,31 +1747,32 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
|
|||
"vld $vr4, %0, 0 \n\t"
|
||||
"vld $vr5, %0, 16 \n\t"
|
||||
"vld $vr6, %0, 32 \n\t"
|
||||
"vld $vr7, %0, 48 \n\t" // load 16 pixels of RGBA
|
||||
"vld $vr7, %0, 48 \n\t" // load 16 pixels of
|
||||
// RGBA
|
||||
"vor.v $vr12, $vr3, $vr3 \n\t"
|
||||
"vor.v $vr13, $vr3, $vr3 \n\t"
|
||||
"addi.d %2, %2, -16 \n\t" // 16 processed per loop.
|
||||
"vpickev.b $vr8, $vr5, $vr4 \n\t" //AG
|
||||
"addi.d %2, %2, -16 \n\t" // 16 processed per
|
||||
// loop.
|
||||
"vpickev.b $vr8, $vr5, $vr4 \n\t" // AG
|
||||
"vpickev.b $vr10, $vr7, $vr6 \n\t"
|
||||
"vpickod.b $vr9, $vr5, $vr4 \n\t" //BR
|
||||
"vpickod.b $vr9, $vr5, $vr4 \n\t" // BR
|
||||
"vpickod.b $vr11, $vr7, $vr6 \n\t"
|
||||
"vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" //B
|
||||
"vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B
|
||||
"vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t"
|
||||
"vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" //G
|
||||
"vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G
|
||||
"vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t"
|
||||
"vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" //R
|
||||
"vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R
|
||||
"vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t"
|
||||
"addi.d %0, %0, 64 \n\t"
|
||||
"vpickod.b $vr10, $vr13, $vr12 \n\t"
|
||||
"vst $vr10, %1, 0 \n\t"
|
||||
"addi.d %1, %1, 16 \n\t"
|
||||
"bnez %2, 1b \n\t"
|
||||
: "+&r"(src_rgba), // %0
|
||||
"+&r"(dst_y), // %1
|
||||
"+&r"(width) // %2
|
||||
: "+&r"(src_rgba), // %0
|
||||
"+&r"(dst_y), // %1
|
||||
"+&r"(width) // %2
|
||||
: "r"(rgbconstants)
|
||||
: "memory"
|
||||
);
|
||||
: "memory");
|
||||
}
|
||||
|
||||
void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
|
@ -1789,11 +1791,12 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
|
|||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
|
||||
24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
|
||||
1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
|
||||
25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
|
||||
asm volatile(
|
||||
int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
|
||||
20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6,
|
||||
7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
|
||||
0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
|
||||
31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
|
||||
asm volatile(
|
||||
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
|
||||
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
|
||||
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
|
||||
|
@ -1805,19 +1808,21 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
|
|||
"1: \n\t"
|
||||
"vld $vr8, %0, 0 \n\t"
|
||||
"vld $vr9, %0, 16 \n\t"
|
||||
"vld $vr10, %0, 32 \n\t" // load 16 pixels of RGB
|
||||
"vld $vr10, %0, 32 \n\t" // load 16 pixels of
|
||||
// RGB
|
||||
"vor.v $vr12, $vr3, $vr3 \n\t"
|
||||
"vor.v $vr13, $vr3, $vr3 \n\t"
|
||||
"addi.d %2, %2, -16 \n\t" // 16 processed per loop.
|
||||
"addi.d %2, %2, -16 \n\t" // 16 processed per
|
||||
// loop.
|
||||
"vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t"
|
||||
"vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t"
|
||||
"vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t"
|
||||
"vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t"
|
||||
"vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" //G
|
||||
"vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G
|
||||
"vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t"
|
||||
"vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" //B
|
||||
"vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B
|
||||
"vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t"
|
||||
"vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" //R
|
||||
"vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R
|
||||
"vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t"
|
||||
"addi.d %0, %0, 48 \n\t"
|
||||
"vpickod.b $vr10, $vr13, $vr12 \n\t"
|
||||
|
@ -1829,8 +1834,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
|
|||
"+&r"(width) // %2
|
||||
: "r"(rgbconstants), // %3
|
||||
"r"(shuff) // %4
|
||||
: "memory"
|
||||
);
|
||||
: "memory");
|
||||
}
|
||||
|
||||
void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||
|
|
|
@ -820,6 +820,28 @@ void MergeUVRow_NEON(const uint8_t* src_u,
|
|||
: "cc", "memory", "v0", "v1" // Clobber List
|
||||
);
|
||||
}
|
||||
// Reads 16 U's and V's and writes out 16 pairs of UV.
|
||||
void MergeUVRow_NEON1(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v0.16b,v2.16b}, [%0], #32 \n" // load U
|
||||
"ld1 {v1.16b,v3.16b}, [%1], #32 \n" // load V
|
||||
"subs %w3, %w3, #32 \n" // 32 processed per loop
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"st2 {v0.16b,v1.16b,v2.16b,v3.16b}, [%2], #64 \n" // store 32 UV
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_u), // %0
|
||||
"+r"(src_v), // %1
|
||||
"+r"(dst_uv), // %2
|
||||
"+r"(width) // %3 // Output registers
|
||||
: // Input registers
|
||||
: "cc", "memory", "v0", "v1" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
void MergeUVRow_16_NEON(const uint16_t* src_u,
|
||||
const uint16_t* src_v,
|
||||
|
|
|
@ -1065,10 +1065,10 @@ void ScalePlaneBilinearDown(int src_width,
|
|||
|
||||
const int max_y = (src_height - 1) << 16;
|
||||
int j;
|
||||
void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
|
||||
void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
|
||||
int dst_width, int x, int dx) =
|
||||
(src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
|
||||
void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
|
||||
void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
int source_y_fraction) = InterpolateRow_C;
|
||||
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
|
||||
|
@ -1188,10 +1188,10 @@ void ScalePlaneBilinearDown_16(int src_width,
|
|||
|
||||
const int max_y = (src_height - 1) << 16;
|
||||
int j;
|
||||
void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
|
||||
void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
|
||||
int dst_width, int x, int dx) =
|
||||
(src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
|
||||
void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
|
||||
void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
int source_y_fraction) = InterpolateRow_16_C;
|
||||
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
|
||||
|
@ -1276,10 +1276,10 @@ void ScalePlaneBilinearUp(int src_width,
|
|||
int dx = 0;
|
||||
int dy = 0;
|
||||
const int max_y = (src_height - 1) << 16;
|
||||
void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
|
||||
void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
int source_y_fraction) = InterpolateRow_C;
|
||||
void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
|
||||
void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
|
||||
int dst_width, int x, int dx) =
|
||||
filtering ? ScaleFilterCols_C : ScaleCols_C;
|
||||
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
|
||||
|
@ -1744,10 +1744,10 @@ void ScalePlaneBilinearUp_16(int src_width,
|
|||
int dx = 0;
|
||||
int dy = 0;
|
||||
const int max_y = (src_height - 1) << 16;
|
||||
void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
|
||||
void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
int source_y_fraction) = InterpolateRow_16_C;
|
||||
void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
|
||||
void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
|
||||
int dst_width, int x, int dx) =
|
||||
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
|
||||
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
|
||||
|
@ -1872,7 +1872,7 @@ static void ScalePlaneSimple(int src_width,
|
|||
const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr) {
|
||||
int i;
|
||||
void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
|
||||
void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width,
|
||||
int x, int dx) = ScaleCols_C;
|
||||
// Initial source x/y coordinate and step values as 16.16 fixed point.
|
||||
int x = 0;
|
||||
|
@ -1909,7 +1909,7 @@ static void ScalePlaneSimple_16(int src_width,
|
|||
const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr) {
|
||||
int i;
|
||||
void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
|
||||
void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width,
|
||||
int x, int dx) = ScaleCols_16_C;
|
||||
// Initial source x/y coordinate and step values as 16.16 fixed point.
|
||||
int x = 0;
|
||||
|
|
|
@ -1633,7 +1633,7 @@ void ScalePlaneVertical(int src_height,
|
|||
enum FilterMode filtering) {
|
||||
// TODO(fbarchard): Allow higher bpp.
|
||||
int dst_width_bytes = dst_width * bpp;
|
||||
void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
|
||||
void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
int source_y_fraction) = InterpolateRow_C;
|
||||
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
|
||||
|
@ -1712,7 +1712,7 @@ void ScalePlaneVertical_16(int src_height,
|
|||
enum FilterMode filtering) {
|
||||
// TODO(fbarchard): Allow higher wpp.
|
||||
int dst_width_words = dst_width * wpp;
|
||||
void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
|
||||
void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
int source_y_fraction) = InterpolateRow_16_C;
|
||||
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
|
||||
|
@ -1791,7 +1791,7 @@ void ScalePlaneVertical_16To8(int src_height,
|
|||
// TODO(fbarchard): Allow higher wpp.
|
||||
int dst_width_words = dst_width * wpp;
|
||||
// TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions.
|
||||
void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb,
|
||||
void (*InterpolateRow_16To8)(uint8_t* dst_argb, const uint16_t* src_argb,
|
||||
ptrdiff_t src_stride, int scale, int dst_width,
|
||||
int source_y_fraction) = InterpolateRow_16To8_C;
|
||||
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
|
||||
|
|
|
@ -14,6 +14,10 @@
|
|||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/rotate.h"
|
||||
|
||||
#ifdef ENABLE_ROW_TESTS
|
||||
#include "libyuv/rotate_row.h"
|
||||
#endif
|
||||
|
||||
namespace libyuv {
|
||||
|
||||
#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
|
||||
|
@ -858,4 +862,47 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
|
|||
disable_cpu_flags_, benchmark_cpu_info_);
|
||||
}
|
||||
|
||||
#if defined(ENABLE_ROW_TESTS)
|
||||
|
||||
TEST_F(LibYUVRotateTest, Transpose4x4) {
|
||||
// dst width and height
|
||||
const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
|
||||
const int height = 4;
|
||||
align_buffer_page_end(src_pixels, height * width * 4);
|
||||
align_buffer_page_end(dst_pixels_c, width * height * 4);
|
||||
align_buffer_page_end(dst_pixels_opt, width * height * 4);
|
||||
|
||||
MemRandomize(src_pixels, height * width * 4);
|
||||
memset(dst_pixels_c, 1, width * height * 4);
|
||||
memset(dst_pixels_opt, 1, width * height * 4);
|
||||
|
||||
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
|
||||
(uint8_t*)dst_pixels_c, width * 4, width);
|
||||
|
||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||
#if defined(__aarch64__)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
|
||||
(uint8_t*)dst_pixels_opt, width * 4, width);
|
||||
} else {
|
||||
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
|
||||
(uint8_t*)dst_pixels_opt, width * 4, width);
|
||||
}
|
||||
#else
|
||||
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
|
||||
(uint8_t*)dst_pixels_opt, width * 4, width);
|
||||
#endif
|
||||
}
|
||||
|
||||
// for (int i = 0; i < width * height; ++i) {
|
||||
// EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
||||
// }
|
||||
|
||||
free_aligned_buffer_page_end(src_pixels);
|
||||
free_aligned_buffer_page_end(dst_pixels_c);
|
||||
free_aligned_buffer_page_end(dst_pixels_opt);
|
||||
}
|
||||
|
||||
#endif // ENABLE_ROW_TESTS
|
||||
|
||||
} // namespace libyuv
|
||||
|
|
|
@ -23,7 +23,7 @@ origin:
|
|||
|
||||
# Revision to pull in
|
||||
# Must be a long or short commit SHA (long preferred)
|
||||
revision: b2528b0be934de1918e20c85fc170d809eeb49ab
|
||||
revision: 2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
Загрузка…
Ссылка в новой задаче