Remove unused assembly sources and associated tests.
Change-Id: Ic8386743b1852ca1074528d04e2adc1d191b091b
This commit is contained in:
Родитель
dbfec2a816
Коммит
0d3aeda300
|
@ -374,8 +374,6 @@ set(AOM_UNIT_TEST_SOURCES
|
|||
#"${AOM_ROOT}/test/accounting_test.cc"
|
||||
"${AOM_ROOT}/test/acm_random.h"
|
||||
"${AOM_ROOT}/test/active_map_test.cc"
|
||||
# not in test.mk
|
||||
#"${AOM_ROOT}/test/add_noise_test.cc"
|
||||
"${AOM_ROOT}/test/altref_test.cc"
|
||||
"${AOM_ROOT}/test/android"
|
||||
# requires CONFIG_ANS
|
||||
|
|
|
@ -1,195 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "aom/aom_integer.h"
|
||||
|
||||
/* Table of 440 small non-negative values (each between 0 and 14).
 * The "down" post-processing filters pick a start offset into this table
 * with rand() and add the selected entries as the rounding term of their
 * 16-way average, so the rounding varies from pixel to pixel.
 */
const int16_t aom_rv[] = {
  8,  5,  2,  2,  8,  12, 4,  9,  8,  3,  0,  3,  9,  0,  0,  0,  8,  3,  14,
  4,  10, 1,  11, 14, 1,  14, 9,  6,  12, 11, 8,  6,  10, 0,  0,  8,  9,  0,
  3,  14, 8,  11, 13, 4,  2,  9,  0,  3,  9,  6,  1,  2,  3,  14, 13, 1,  8,
  2,  9,  7,  3,  3,  1,  13, 13, 6,  6,  5,  2,  7,  11, 9,  11, 8,  7,  3,
  2,  0,  13, 13, 14, 4,  12, 5,  12, 10, 8,  10, 13, 10, 4,  14, 4,  10, 0,
  8,  11, 1,  13, 7,  7,  14, 6,  14, 13, 2,  13, 5,  4,  4,  0,  10, 0,  5,
  13, 2,  12, 7,  11, 13, 8,  0,  4,  10, 7,  2,  7,  2,  2,  5,  3,  4,  7,
  3,  3,  14, 14, 5,  9,  13, 3,  14, 3,  6,  3,  0,  11, 8,  13, 1,  13, 1,
  12, 0,  10, 9,  7,  6,  2,  8,  5,  2,  13, 7,  1,  13, 14, 7,  6,  7,  9,
  6,  10, 11, 7,  8,  7,  5,  14, 8,  4,  4,  0,  8,  7,  10, 0,  8,  14, 11,
  3,  12, 5,  7,  14, 3,  14, 5,  2,  6,  11, 12, 12, 8,  0,  11, 13, 1,  2,
  0,  5,  10, 14, 7,  8,  0,  4,  11, 0,  8,  0,  3,  10, 5,  8,  0,  11, 6,
  7,  8,  10, 7,  13, 9,  2,  5,  1,  5,  10, 2,  4,  3,  5,  6,  10, 8,  9,
  4,  11, 14, 0,  10, 0,  5,  13, 2,  12, 7,  11, 13, 8,  0,  4,  10, 7,  2,
  7,  2,  2,  5,  3,  4,  7,  3,  3,  14, 14, 5,  9,  13, 3,  14, 3,  6,  3,
  0,  11, 8,  13, 1,  13, 1,  12, 0,  10, 9,  7,  6,  2,  8,  5,  2,  13, 7,
  1,  13, 14, 7,  6,  7,  9,  6,  10, 11, 7,  8,  7,  5,  14, 8,  4,  4,  0,
  8,  7,  10, 0,  8,  14, 11, 3,  12, 5,  7,  14, 3,  14, 5,  2,  6,  11, 12,
  12, 8,  0,  11, 13, 1,  2,  0,  5,  10, 14, 7,  8,  0,  4,  11, 0,  8,  0,
  3,  10, 5,  8,  0,  11, 6,  7,  8,  10, 7,  13, 9,  2,  5,  1,  5,  10, 2,
  4,  3,  5,  6,  10, 8,  9,  4,  11, 14, 3,  8,  3,  7,  8,  5,  11, 4,  12,
  3,  11, 9,  14, 8,  14, 13, 4,  3,  1,  2,  14, 6,  5,  4,  4,  11, 4,  6,
  2,  1,  5,  8,  8,  12, 13, 5,  14, 10, 12, 13, 0,  9,  5,  5,  11, 10, 13,
  9,  10, 13,
};
|
||||
|
||||
/* Smooths `size` rows of `cols` pixels, first vertically (src -> dst) and
 * then horizontally (in place in dst).
 *
 * A pixel is replaced by a rounded blend of itself and its four neighbours
 * (two on each side along the filter direction) only when every one of those
 * neighbours differs from it by less than the per-column limit f[col];
 * otherwise it is copied unchanged.
 *
 * src_ptr / dst_ptr point at the first pixel of the first row; the strides
 * are given in pixels per line. The caller must provide:
 *   - two readable rows above and below the processed rows in src, and
 *   - two writable pixels to the left and right of each dst row, which this
 *     function overwrites with copies of the row's edge pixels.
 */
void aom_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
                                            unsigned char *dst_ptr,
                                            int src_pixels_per_line,
                                            int dst_pixels_per_line, int cols,
                                            unsigned char *f, int size) {
  /* Horizontal results are held back two pixels so that the pass keeps
   * reading unfiltered neighbours from the row it is rewriting. */
  unsigned char pending[4];
  int y;

  for (y = 0; y < size; ++y) {
    const unsigned char *const in = src_ptr;
    unsigned char *const out = dst_ptr;
    int x;

    /* Vertical ("down") pass: src row -> dst row. */
    for (x = 0; x < cols; ++x) {
      const int up2 = in[x - 2 * src_pixels_per_line];
      const int up1 = in[x - src_pixels_per_line];
      const int dn1 = in[x + src_pixels_per_line];
      const int dn2 = in[x + 2 * src_pixels_per_line];
      const int limit = f[x];
      unsigned char px = in[x];

      if (abs(px - up2) < limit && abs(px - up1) < limit &&
          abs(px - dn1) < limit && abs(px - dn2) < limit) {
        const int above = (up2 + up1 + 1) >> 1;
        const int below = (dn2 + dn1 + 1) >> 1;
        const int around = (above + below + 1) >> 1;
        px = (unsigned char)((around + px + 1) >> 1);
      }

      out[x] = px;
    }

    /* Horizontal ("across") pass, in place. Replicate the edge pixels into
     * the two-pixel borders first so the taps at both ends are defined. */
    out[-2] = out[-1] = out[0];
    out[cols] = out[cols + 1] = out[cols - 1];

    for (x = 0; x < cols; ++x) {
      const int limit = f[x];
      unsigned char px = out[x];

      if (abs(px - out[x - 2]) < limit && abs(px - out[x - 1]) < limit &&
          abs(px - out[x + 1]) < limit && abs(px - out[x + 2]) < limit) {
        const int left = (out[x - 2] + out[x - 1] + 1) >> 1;
        const int right = (out[x + 2] + out[x + 1] + 1) >> 1;
        const int around = (left + right + 1) >> 1;
        px = (unsigned char)((around + px + 1) >> 1);
      }

      pending[x & 3] = px;
      if (x >= 2) out[x - 2] = pending[(x - 2) & 3];
    }

    /* Flush the two results still held back for the end of the row. */
    out[x - 2] = pending[(x - 2) & 3];
    out[x - 1] = pending[(x - 1) & 3];

    src_ptr += src_pixels_per_line;
    dst_ptr += dst_pixels_per_line;
  }
}
|
||||
|
||||
/* In-place horizontal smoothing of `rows` rows of `cols` pixels.
 *
 * For every pixel a running sum and sum of squares over the 15-pixel window
 * centred on it are maintained. When 15 * sumsq - sum * sum is below
 * `flimit`, the pixel is replaced by (8 + sum + pixel) >> 4; otherwise it is
 * left unchanged.
 *
 * src points at the first pixel of the first row and pitch is the distance
 * between rows. The caller must provide 8 writable bytes to the left and 17
 * writable bytes to the right of every row: they are first filled with copies
 * of the edge pixels, and the 8 left bytes are left zeroed on return.
 */
void aom_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
                                 int cols, int flimit) {
  int r, c, i;

  unsigned char *s = src;
  /* Ring buffer delaying each result by 8 pixels, so the window keeps
   * reading unfiltered data from the row being rewritten. */
  unsigned char d[16];

  for (r = 0; r < rows; r++) {
    int sumsq = 0;
    int sum = 0;

    /* Extend the row to the left with copies of its first pixel. */
    for (i = -8; i < 0; i++) s[i] = s[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];

    /* Prime the window with pixels -8..6; the main loop slides it to
     * [c - 7, c + 7] before each pixel is evaluated. */
    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
      sum += s[i];
    }

    /* Clear every slot of the delay buffer for this row. Slot 15 is read at
     * c == 7 before anything has been stored in it, so all 16 entries must
     * start from a defined value. */
    for (i = 0; i < 16; i++) d[i] = 0;

    for (c = 0; c < cols + 8; c++) {
      /* x is the change in the window sum; x * y equals
       * s[c + 7]^2 - s[c - 8]^2, the change in the sum of squares. */
      int x = s[c + 7] - s[c - 8];
      int y = s[c + 7] + s[c - 8];

      sum += x;
      sumsq += x * y;

      d[c & 15] = s[c];

      if (sumsq * 15 - sum * sum < flimit) {
        d[c & 15] = (8 + sum + s[c]) >> 4;
      }

      /* Write back the result computed 8 pixels ago. */
      s[c - 8] = d[(c - 8) & 15];
    }

    s += pitch;
  }
}
|
||||
|
||||
void aom_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
|
||||
int flimit) {
|
||||
int r, c, i;
|
||||
const int16_t *rv3 = &aom_rv[63 & rand()];
|
||||
|
||||
for (c = 0; c < cols; c++) {
|
||||
unsigned char *s = &dst[c];
|
||||
int sumsq = 0;
|
||||
int sum = 0;
|
||||
unsigned char d[16];
|
||||
const int16_t *rv2 = rv3 + ((c * 17) & 127);
|
||||
|
||||
for (i = -8; i < 0; i++) s[i * pitch] = s[0];
|
||||
|
||||
/* 17 avoids valgrind warning - we buffer values in c in d
|
||||
* and only write them when we've read 8 ahead...
|
||||
*/
|
||||
for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch];
|
||||
|
||||
for (i = -8; i <= 6; i++) {
|
||||
sumsq += s[i * pitch] * s[i * pitch];
|
||||
sum += s[i * pitch];
|
||||
}
|
||||
|
||||
for (r = 0; r < rows + 8; r++) {
|
||||
sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
|
||||
sum += s[7 * pitch] - s[-8 * pitch];
|
||||
d[r & 15] = s[0];
|
||||
|
||||
if (sumsq * 15 - sum * sum < flimit) {
|
||||
d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
|
||||
}
|
||||
if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
|
||||
s += pitch;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,682 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "./macros_msa.h"
|
||||
|
||||
extern const int16_t aom_rv[];
|
||||
|
||||
/* Rearranges eight input vectors (in0..in7) into sixteen output vectors
 * (out0..out15) using the interleave helpers; going by its name this is an
 * 8x16 byte transpose.
 *
 * All inputs are consumed before out0..out7 are written, so the outputs may
 * reuse the input variables (as the luma filter in this file does). Each
 * odd-numbered output is the upper doubleword of the preceding even output.
 */
#define AOM_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                                out1, out2, out3, out4, out5, out6, out7, \
                                out8, out9, out10, out11, out12, out13, \
                                out14, out15) \
  { \
    v8i16 t0_m, t1_m, t2_m, t3_m, t4_m; \
    v8i16 t5_m, t6_m, t7_m, t8_m, t9_m; \
    \
    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, t0_m, t1_m, t2_m, \
               t3_m); \
    ILVR_H2_SH(t1_m, t0_m, t3_m, t2_m, t4_m, t5_m); \
    ILVRL_W2_SH(t5_m, t4_m, t6_m, t7_m); \
    ILVL_H2_SH(t1_m, t0_m, t3_m, t2_m, t4_m, t5_m); \
    ILVRL_W2_SH(t5_m, t4_m, t8_m, t9_m); \
    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, t0_m, t1_m, t2_m, \
               t3_m); \
    ILVR_H2_SH(t1_m, t0_m, t3_m, t2_m, t4_m, t5_m); \
    ILVRL_W2_UB(t5_m, t4_m, out8, out10); \
    ILVL_H2_SH(t1_m, t0_m, t3_m, t2_m, t4_m, t5_m); \
    ILVRL_W2_UB(t5_m, t4_m, out12, out14); \
    \
    out0 = (v16u8)t6_m; \
    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
    out2 = (v16u8)t7_m; \
    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
    out4 = (v16u8)t8_m; \
    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
    out6 = (v16u8)t9_m; \
    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
  }
|
||||
|
||||
/* Vector form of the per-pixel smoothing step used by the C filter above.
 *
 * `out` first receives the nested averages of the four neighbours and
 * src_in. A per-byte mask is then built that is set only where all four
 * absolute differences |src_in - neighbour| are below `ref`, and
 * __msa_bmz_v merges src_in back into `out` under that mask (presumably
 * restoring src_in wherever the mask is clear - confirm against the MSA
 * instruction reference).
 *
 * `out` is assigned before the differences are computed, so it must not
 * alias any of the input arguments.
 */
#define AOM_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \
                           ref, out) \
  { \
    v16u8 avg_m, tmp_m, mask_m; \
    \
    avg_m = __msa_aver_u_b(above2_in, above1_in); \
    tmp_m = __msa_aver_u_b(below2_in, below1_in); \
    avg_m = __msa_aver_u_b(avg_m, tmp_m); \
    out = __msa_aver_u_b(src_in, avg_m); \
    \
    mask_m = __msa_asub_u_b(src_in, above2_in); \
    mask_m = (mask_m < ref); \
    tmp_m = __msa_asub_u_b(src_in, above1_in); \
    tmp_m = (tmp_m < ref); \
    mask_m = mask_m & tmp_m; \
    tmp_m = __msa_asub_u_b(src_in, below1_in); \
    tmp_m = (tmp_m < ref); \
    mask_m = mask_m & tmp_m; \
    tmp_m = __msa_asub_u_b(src_in, below2_in); \
    tmp_m = (tmp_m < ref); \
    mask_m = mask_m & tmp_m; \
    \
    out = __msa_bmz_v(out, src_in, mask_m); \
  }
|
||||
|
||||
/* In-place rearrangement of sixteen vectors: reads in0..in15 and overwrites
 * in0..in11 with the result (in12..in15 are only read); going by its name
 * this is a 12x16 byte transpose.
 *
 * The statement order matters: every input is read by an interleave before
 * the assignment that overwrites it. The interleave of in8..in11 into
 * temp4/temp5 is issued once; an identical back-to-back repeat of that
 * statement was redundant (same inputs, same outputs) and has been dropped.
 */
#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                         in10, in11, in12, in13, in14, in15) \
  { \
    v8i16 temp0, temp1, temp2, temp3, temp4; \
    v8i16 temp5, temp6, temp7, temp8, temp9; \
    \
    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
    ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
    ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
    ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
    ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
    ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
    ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
    ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
    ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
    ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
    ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
    ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
    in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
    in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
    ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
    ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
    in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
    in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3, \
               temp4, temp5); \
    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \
               temp7, temp8, temp9); \
    ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
    in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
    in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
    ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
    in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
    in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
  }
|
||||
|
||||
/* In-place rearrangement of vectors: reads in0..in7 and writes in0..in11;
 * going by its name this is a 12x8 byte transpose.
 *
 * All reads of in0..in7 happen before any of them is overwritten. Each
 * odd-numbered result is the upper doubleword of the even result before it.
 */
#define AOM_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \
                                in9, in10, in11) \
  { \
    v8i16 temp0, temp1, temp2, temp3; \
    v8i16 temp4, temp5, temp6, temp7; \
    \
    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
    ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
    ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
    ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
    ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
    ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
    temp4 = __msa_ilvr_h(temp5, temp4); \
    ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
    temp5 = __msa_ilvr_h(temp7, temp6); \
    ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
    \
    in0 = (v16u8)temp0; \
    in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
    in2 = (v16u8)temp1; \
    in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
    in4 = (v16u8)temp2; \
    in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
    in6 = (v16u8)temp3; \
    in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
    in8 = (v16u8)temp6; \
    in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
    in10 = (v16u8)temp7; \
    in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
  }
|
||||
|
||||
static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
|
||||
int32_t src_stride,
|
||||
int32_t dst_stride, int32_t cols,
|
||||
uint8_t *f) {
|
||||
uint8_t *p_src = src_ptr;
|
||||
uint8_t *p_dst = dst_ptr;
|
||||
uint8_t *f_orig = f;
|
||||
uint8_t *p_dst_st = dst_ptr;
|
||||
uint16_t col;
|
||||
uint64_t out0, out1, out2, out3;
|
||||
v16u8 above2, above1, below2, below1, src, ref, ref_temp;
|
||||
v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
|
||||
v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
|
||||
|
||||
for (col = (cols / 16); col--;) {
|
||||
ref = LD_UB(f);
|
||||
LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
|
||||
src = LD_UB(p_src);
|
||||
LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
|
||||
above2 = LD_UB(p_src + 3 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
|
||||
above1 = LD_UB(p_src + 4 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
|
||||
src = LD_UB(p_src + 5 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
|
||||
below1 = LD_UB(p_src + 6 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
|
||||
below2 = LD_UB(p_src + 7 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
|
||||
above2 = LD_UB(p_src + 8 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
|
||||
above1 = LD_UB(p_src + 9 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
|
||||
ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
|
||||
p_dst, dst_stride);
|
||||
|
||||
p_dst += 16;
|
||||
p_src += 16;
|
||||
f += 16;
|
||||
}
|
||||
|
||||
if (0 != (cols / 16)) {
|
||||
ref = LD_UB(f);
|
||||
LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
|
||||
src = LD_UB(p_src);
|
||||
LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
|
||||
above2 = LD_UB(p_src + 3 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
|
||||
above1 = LD_UB(p_src + 4 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
|
||||
src = LD_UB(p_src + 5 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
|
||||
below1 = LD_UB(p_src + 6 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
|
||||
below2 = LD_UB(p_src + 7 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
|
||||
above2 = LD_UB(p_src + 8 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
|
||||
above1 = LD_UB(p_src + 9 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
|
||||
out0 = __msa_copy_u_d((v2i64)inter0, 0);
|
||||
out1 = __msa_copy_u_d((v2i64)inter1, 0);
|
||||
out2 = __msa_copy_u_d((v2i64)inter2, 0);
|
||||
out3 = __msa_copy_u_d((v2i64)inter3, 0);
|
||||
SD4(out0, out1, out2, out3, p_dst, dst_stride);
|
||||
|
||||
out0 = __msa_copy_u_d((v2i64)inter4, 0);
|
||||
out1 = __msa_copy_u_d((v2i64)inter5, 0);
|
||||
out2 = __msa_copy_u_d((v2i64)inter6, 0);
|
||||
out3 = __msa_copy_u_d((v2i64)inter7, 0);
|
||||
SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
|
||||
}
|
||||
|
||||
f = f_orig;
|
||||
p_dst = dst_ptr - 2;
|
||||
LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
|
||||
inter6, inter7);
|
||||
|
||||
for (col = 0; col < (cols / 8); ++col) {
|
||||
ref = LD_UB(f);
|
||||
f += 8;
|
||||
AOM_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
|
||||
inter6, inter7, inter8, inter9, inter10, inter11);
|
||||
if (0 == col) {
|
||||
above2 = inter2;
|
||||
above1 = inter2;
|
||||
} else {
|
||||
above2 = inter0;
|
||||
above1 = inter1;
|
||||
}
|
||||
src = inter2;
|
||||
below1 = inter3;
|
||||
below2 = inter4;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
|
||||
above2 = inter5;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
|
||||
above1 = inter6;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
|
||||
src = inter7;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
|
||||
AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
|
||||
below1 = inter8;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
|
||||
AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
|
||||
below2 = inter9;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
|
||||
if (col == (cols / 8 - 1)) {
|
||||
above2 = inter9;
|
||||
} else {
|
||||
above2 = inter10;
|
||||
}
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
|
||||
if (col == (cols / 8 - 1)) {
|
||||
above1 = inter9;
|
||||
} else {
|
||||
above1 = inter11;
|
||||
}
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
|
||||
TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
|
||||
inter9, inter2, inter3, inter4, inter5, inter6, inter7,
|
||||
inter8, inter9);
|
||||
p_dst += 8;
|
||||
LD_UB2(p_dst, dst_stride, inter0, inter1);
|
||||
ST8x1_UB(inter2, p_dst_st);
|
||||
ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
|
||||
LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
|
||||
ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
|
||||
ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
|
||||
LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
|
||||
ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
|
||||
ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
|
||||
LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
|
||||
ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
|
||||
ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
|
||||
p_dst_st += 8;
|
||||
}
|
||||
}
|
||||
|
||||
static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
|
||||
int32_t src_stride,
|
||||
int32_t dst_stride, int32_t cols,
|
||||
uint8_t *f) {
|
||||
uint8_t *p_src = src_ptr;
|
||||
uint8_t *p_dst = dst_ptr;
|
||||
uint8_t *p_dst_st = dst_ptr;
|
||||
uint8_t *f_orig = f;
|
||||
uint16_t col;
|
||||
v16u8 above2, above1, below2, below1;
|
||||
v16u8 src, ref, ref_temp;
|
||||
v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
|
||||
v16u8 inter7, inter8, inter9, inter10, inter11;
|
||||
v16u8 inter12, inter13, inter14, inter15;
|
||||
|
||||
for (col = (cols / 16); col--;) {
|
||||
ref = LD_UB(f);
|
||||
LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
|
||||
src = LD_UB(p_src);
|
||||
LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
|
||||
above2 = LD_UB(p_src + 3 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
|
||||
above1 = LD_UB(p_src + 4 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
|
||||
src = LD_UB(p_src + 5 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
|
||||
below1 = LD_UB(p_src + 6 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
|
||||
below2 = LD_UB(p_src + 7 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
|
||||
above2 = LD_UB(p_src + 8 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
|
||||
above1 = LD_UB(p_src + 9 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
|
||||
src = LD_UB(p_src + 10 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
|
||||
below1 = LD_UB(p_src + 11 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
|
||||
below2 = LD_UB(p_src + 12 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
|
||||
above2 = LD_UB(p_src + 13 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
|
||||
above1 = LD_UB(p_src + 14 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
|
||||
src = LD_UB(p_src + 15 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
|
||||
below1 = LD_UB(p_src + 16 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
|
||||
below2 = LD_UB(p_src + 17 * src_stride);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
|
||||
ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
|
||||
p_dst, dst_stride);
|
||||
ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
|
||||
p_dst + 8 * dst_stride, dst_stride);
|
||||
p_src += 16;
|
||||
p_dst += 16;
|
||||
f += 16;
|
||||
}
|
||||
|
||||
f = f_orig;
|
||||
p_dst = dst_ptr - 2;
|
||||
LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
|
||||
inter6, inter7);
|
||||
LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
|
||||
inter12, inter13, inter14, inter15);
|
||||
|
||||
for (col = 0; col < cols / 8; ++col) {
|
||||
ref = LD_UB(f);
|
||||
f += 8;
|
||||
TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
|
||||
inter7, inter8, inter9, inter10, inter11, inter12, inter13,
|
||||
inter14, inter15);
|
||||
if (0 == col) {
|
||||
above2 = inter2;
|
||||
above1 = inter2;
|
||||
} else {
|
||||
above2 = inter0;
|
||||
above1 = inter1;
|
||||
}
|
||||
|
||||
src = inter2;
|
||||
below1 = inter3;
|
||||
below2 = inter4;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
|
||||
above2 = inter5;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
|
||||
above1 = inter6;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
|
||||
src = inter7;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
|
||||
AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
|
||||
below1 = inter8;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
|
||||
AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
|
||||
below2 = inter9;
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
|
||||
AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
|
||||
if (col == (cols / 8 - 1)) {
|
||||
above2 = inter9;
|
||||
} else {
|
||||
above2 = inter10;
|
||||
}
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
|
||||
AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
|
||||
if (col == (cols / 8 - 1)) {
|
||||
above1 = inter9;
|
||||
} else {
|
||||
above1 = inter11;
|
||||
}
|
||||
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
|
||||
AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
|
||||
AOM_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
|
||||
inter8, inter9, inter2, inter3, inter4, inter5,
|
||||
inter6, inter7, inter8, inter9, inter10, inter11,
|
||||
inter12, inter13, inter14, inter15, above2, above1);
|
||||
|
||||
p_dst += 8;
|
||||
LD_UB2(p_dst, dst_stride, inter0, inter1);
|
||||
ST8x1_UB(inter2, p_dst_st);
|
||||
ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
|
||||
LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
|
||||
ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
|
||||
ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
|
||||
LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
|
||||
ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
|
||||
ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
|
||||
LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
|
||||
ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
|
||||
ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
|
||||
LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
|
||||
ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
|
||||
ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
|
||||
LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
|
||||
ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
|
||||
ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
|
||||
LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
|
||||
ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
|
||||
ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
|
||||
LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
|
||||
ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
|
||||
ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
|
||||
p_dst_st += 8;
|
||||
}
|
||||
}
|
||||
|
||||
/* MSA entry point for the down-then-across filter.
 *
 * Dispatches on the band height: 8 rows go to the chroma routine, 16 rows to
 * the luma routine. Any other `size` is silently ignored and dst is left
 * untouched.
 */
void aom_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
                                              int32_t src_stride,
                                              int32_t dst_stride, int32_t cols,
                                              uint8_t *f, int32_t size) {
  switch (size) {
    case 8:
      postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols,
                                      f);
      break;
    case 16:
      postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
      break;
    default:
      /* Unsupported band height: nothing to do. */
      break;
  }
}
|
||||
|
||||
void aom_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
|
||||
int32_t rows, int32_t cols, int32_t flimit) {
|
||||
int32_t row, col, cnt;
|
||||
uint8_t *src_dup = src_ptr;
|
||||
v16u8 src0, src, tmp_orig;
|
||||
v16u8 tmp = { 0 };
|
||||
v16i8 zero = { 0 };
|
||||
v8u16 sum_h, src_r_h, src_l_h;
|
||||
v4u32 src_r_w, src_l_w;
|
||||
v4i32 flimit_vec;
|
||||
|
||||
flimit_vec = __msa_fill_w(flimit);
|
||||
for (row = rows; row--;) {
|
||||
int32_t sum_sq = 0;
|
||||
int32_t sum = 0;
|
||||
src0 = (v16u8)__msa_fill_b(src_dup[0]);
|
||||
ST8x1_UB(src0, (src_dup - 8));
|
||||
|
||||
src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
|
||||
ST_UB(src0, src_dup + cols);
|
||||
src_dup[cols + 16] = src_dup[cols - 1];
|
||||
tmp_orig = (v16u8)__msa_ldi_b(0);
|
||||
tmp_orig[15] = tmp[15];
|
||||
src = LD_UB(src_dup - 8);
|
||||
src[15] = 0;
|
||||
ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
|
||||
src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
|
||||
src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
|
||||
sum_sq = HADD_SW_S32(src_r_w);
|
||||
sum_sq += HADD_SW_S32(src_l_w);
|
||||
sum_h = __msa_hadd_u_h(src, src);
|
||||
sum = HADD_UH_U32(sum_h);
|
||||
{
|
||||
v16u8 src7, src8, src_r, src_l;
|
||||
v16i8 mask;
|
||||
v8u16 add_r, add_l;
|
||||
v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
|
||||
v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
|
||||
v4i32 sub0, sub1, sub2, sub3;
|
||||
v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
|
||||
v4i32 mul0, mul1, mul2, mul3;
|
||||
v4i32 total0, total1, total2, total3;
|
||||
v8i16 const8 = __msa_fill_h(8);
|
||||
|
||||
src7 = LD_UB(src_dup + 7);
|
||||
src8 = LD_UB(src_dup - 8);
|
||||
for (col = 0; col < (cols >> 4); ++col) {
|
||||
ILVRL_B2_UB(src7, src8, src_r, src_l);
|
||||
HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
|
||||
|
||||
sum_r[0] = sum + sub_r[0];
|
||||
for (cnt = 0; cnt < 7; ++cnt) {
|
||||
sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
|
||||
}
|
||||
sum_l[0] = sum_r[7] + sub_l[0];
|
||||
for (cnt = 0; cnt < 7; ++cnt) {
|
||||
sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
|
||||
}
|
||||
sum = sum_l[7];
|
||||
src = LD_UB(src_dup + 16 * col);
|
||||
ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
|
||||
src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
|
||||
src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
|
||||
tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
|
||||
|
||||
HADD_UB2_UH(src_r, src_l, add_r, add_l);
|
||||
UNPCK_SH_SW(sub_r, sub0, sub1);
|
||||
UNPCK_SH_SW(sub_l, sub2, sub3);
|
||||
ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
|
||||
ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
|
||||
MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1,
|
||||
mul2, mul3);
|
||||
sum_sq0[0] = sum_sq + mul0[0];
|
||||
for (cnt = 0; cnt < 3; ++cnt) {
|
||||
sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
|
||||
}
|
||||
sum_sq1[0] = sum_sq0[3] + mul1[0];
|
||||
for (cnt = 0; cnt < 3; ++cnt) {
|
||||
sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
|
||||
}
|
||||
sum_sq2[0] = sum_sq1[3] + mul2[0];
|
||||
for (cnt = 0; cnt < 3; ++cnt) {
|
||||
sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
|
||||
}
|
||||
sum_sq3[0] = sum_sq2[3] + mul3[0];
|
||||
for (cnt = 0; cnt < 3; ++cnt) {
|
||||
sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
|
||||
}
|
||||
sum_sq = sum_sq3[3];
|
||||
|
||||
UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
|
||||
UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
|
||||
total0 = sum_sq0 * __msa_ldi_w(15);
|
||||
total0 -= sum0_w * sum0_w;
|
||||
total1 = sum_sq1 * __msa_ldi_w(15);
|
||||
total1 -= sum1_w * sum1_w;
|
||||
total2 = sum_sq2 * __msa_ldi_w(15);
|
||||
total2 -= sum2_w * sum2_w;
|
||||
total3 = sum_sq3 * __msa_ldi_w(15);
|
||||
total3 -= sum3_w * sum3_w;
|
||||
total0 = (total0 < flimit_vec);
|
||||
total1 = (total1 < flimit_vec);
|
||||
total2 = (total2 < flimit_vec);
|
||||
total3 = (total3 < flimit_vec);
|
||||
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
|
||||
mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
|
||||
tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
|
||||
|
||||
if (col == 0) {
|
||||
uint64_t src_d;
|
||||
|
||||
src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
|
||||
SD(src_d, (src_dup - 8));
|
||||
}
|
||||
|
||||
src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
|
||||
src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
|
||||
ST_UB(tmp, (src_dup + (16 * col)));
|
||||
}
|
||||
|
||||
src_dup += pitch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Vertical ("down") post-processing filter over a plane, MSA version.
 *
 * Works on strips of 16 columns (cols >> 4 strips; any remaining columns are
 * not visited by this loop).  For every pixel a running sum and sum of squares
 * over a 15-row vertical window is maintained; where 15 * sumsq - sum * sum is
 * below flimit the pixel is replaced by (noise + sum + pixel) >> 4, otherwise
 * it is left unchanged.
 *
 * dst_ptr: top-left pixel of the plane, filtered in place.
 * pitch:   distance in bytes between vertically adjacent pixels.
 * rows:    number of rows to filter.
 * cols:    number of columns.
 * flimit:  threshold compared against the windowed variance term.
 *
 * Side effects visible here: writes 8 rows above row 0 and 17 rows below the
 * last row (border replication), and calls rand().
 * NOTE(review): LD_UB/ST_UB/UNPCK_ / ILV / HADD / HSUB / MUL2 / ADD2 / PCKEV
 * macros are defined elsewhere; descriptions of them below are presumed from
 * their names and usage - confirm against the macro header.
 */
void aom_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
|
||||
int32_t cols, int32_t flimit) {
|
||||
int32_t row, col, cnt, i;
|
||||
/* Start reading the aom_rv noise table at a random offset in [0, 63]. */
const int16_t *rv3 = &aom_rv[63 & rand()];
|
||||
v4i32 flimit_vec;
|
||||
v16u8 dst7, dst8, dst_r_b, dst_l_b;
|
||||
v16i8 mask;
|
||||
v8u16 add_r, add_l;
|
||||
v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
|
||||
v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
|
||||
|
||||
/* Broadcast the threshold to all four 32-bit lanes. */
flimit_vec = __msa_fill_w(flimit);
|
||||
|
||||
/* One iteration per strip of 16 columns. */
for (col = 0; col < (cols >> 4); ++col) {
|
||||
uint8_t *dst_tmp = &dst_ptr[col << 4];
|
||||
v16u8 dst;
|
||||
v16i8 zero = { 0 };
|
||||
/* Ring buffer of the 16 most recent filtered rows; write-back is delayed. */
v16u8 tmp[16];
|
||||
v8i16 mult0, mult1, rv2_0, rv2_1;
|
||||
/* Running column sums (16-bit lanes) for columns 0-7 and 8-15. */
v8i16 sum0_h = { 0 };
|
||||
v8i16 sum1_h = { 0 };
|
||||
/* Running column sums of squares (32-bit lanes), four columns each. */
v4i32 mul0 = { 0 };
|
||||
v4i32 mul1 = { 0 };
|
||||
v4i32 mul2 = { 0 };
|
||||
v4i32 mul3 = { 0 };
|
||||
v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
|
||||
v4i32 add0, add1, add2, add3;
|
||||
/* Per-column base pointers into the noise table. */
const int16_t *rv2[16];
|
||||
|
||||
dst = LD_UB(dst_tmp);
|
||||
/* Column c of the plane reads noise starting at rv3 + ((c * 17) & 127). */
for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
|
||||
rv2[i] = rv3 + ((cnt * 17) & 127);
|
||||
++i;
|
||||
}
|
||||
/* Replicate row 0 into the 8 rows above the plane. */
for (cnt = -8; cnt < 0; ++cnt) {
|
||||
ST_UB(dst, dst_tmp + cnt * pitch);
|
||||
}
|
||||
|
||||
/* Replicate the last row into the 17 rows below the plane. */
dst = LD_UB((dst_tmp + (rows - 1) * pitch));
|
||||
for (cnt = rows; cnt < rows + 17; ++cnt) {
|
||||
ST_UB(dst, dst_tmp + cnt * pitch);
|
||||
}
|
||||
/* Prime the running sum and sum of squares with rows -8..6 (15 rows). */
for (cnt = -8; cnt <= 6; ++cnt) {
|
||||
dst = LD_UB(dst_tmp + (cnt * pitch));
|
||||
UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
|
||||
MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
|
||||
/* Interleaving with zero widens the 16-bit squares to 32-bit lanes. */
mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
|
||||
mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
|
||||
mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
|
||||
mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
|
||||
ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
|
||||
}
|
||||
|
||||
/* Runs 8 rows past the end so the delayed write-back below drains. */
for (row = 0; row < (rows + 8); ++row) {
|
||||
/* Gather this row's noise value for each of the 16 columns. */
for (i = 0; i < 8; ++i) {
|
||||
rv2_0[i] = *(rv2[i] + (row & 127));
|
||||
rv2_1[i] = *(rv2[i + 8] + (row & 127));
|
||||
}
|
||||
/* Row entering the window (row + 7) and row leaving it (row - 8). */
dst7 = LD_UB(dst_tmp + (7 * pitch));
|
||||
dst8 = LD_UB(dst_tmp - (8 * pitch));
|
||||
ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
|
||||
|
||||
/* Presumably sub = entering - leaving per column; slide the sum. */
HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
|
||||
UNPCK_SH_SW(sub_r, sub0, sub1);
|
||||
UNPCK_SH_SW(sub_l, sub2, sub3);
|
||||
sum0_h += sub_r;
|
||||
sum1_h += sub_l;
|
||||
|
||||
/* Presumably add = entering + leaving per column. */
HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
|
||||
|
||||
ILVRL_H2_SW(zero, add_r, add0, add1);
|
||||
ILVRL_H2_SW(zero, add_l, add2, add3);
|
||||
/* (a + b) * (a - b) = a^2 - b^2: slides the sum of squares. */
mul0 += add0 * sub0;
|
||||
mul1 += add1 * sub1;
|
||||
mul2 += add2 * sub2;
|
||||
mul3 += add3 * sub3;
|
||||
/* Candidate filtered value: (noise + window sum + pixel) >> 4. */
dst = LD_UB(dst_tmp);
|
||||
ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
|
||||
dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
|
||||
dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
|
||||
tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);
|
||||
|
||||
/* Per-column test: 15 * sumsq - sum * sum < flimit. */
UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
|
||||
UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
|
||||
total0 = mul0 * __msa_ldi_w(15);
|
||||
total0 -= sum0_w * sum0_w;
|
||||
total1 = mul1 * __msa_ldi_w(15);
|
||||
total1 -= sum1_w * sum1_w;
|
||||
total2 = mul2 * __msa_ldi_w(15);
|
||||
total2 -= sum2_w * sum2_w;
|
||||
total3 = mul3 * __msa_ldi_w(15);
|
||||
total3 -= sum3_w * sum3_w;
|
||||
total0 = (total0 < flimit_vec);
|
||||
total1 = (total1 < flimit_vec);
|
||||
total2 = (total2 < flimit_vec);
|
||||
total3 = (total3 < flimit_vec);
|
||||
/* Narrow the four 32-bit comparison results to one byte mask. */
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
|
||||
mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
|
||||
/* Presumably keeps the filtered byte where the mask is set and restores
 * the original pixel where it is clear (bit-move-if-zero) - confirm. */
tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);
|
||||
|
||||
/* Write back 8 rows late so rows still inside the window stay unfiltered. */
if (row >= 8) {
|
||||
ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
|
||||
}
|
||||
|
||||
dst_tmp += pitch;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,83 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
;
|
||||
; This source code is subject to the terms of the BSD 2 Clause License and
|
||||
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
; was not distributed with this source code in the LICENSE file, you can
|
||||
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
; Media Patent License 1.0 was not distributed with this source code in the
|
||||
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
;
|
||||
|
||||
%include "aom_ports/x86_abi_support.asm"
|
||||
|
||||
;void aom_plane_add_noise_sse2(unsigned char *start, unsigned char *noise,
|
||||
; unsigned char blackclamp[16],
|
||||
; unsigned char whiteclamp[16],
|
||||
; unsigned char bothclamp[16],
|
||||
; unsigned int width, unsigned int height,
|
||||
; int pitch)
|
||||
;
; Adds noise to a plane in place, one row at a time. Each pixel is first
; pulled away from 0 and 255 with saturating arithmetic (- blackclamp,
; + bothclamp, - whiteclamp) and then has a noise byte added with a wrapping
; byte add. Every row calls LIBAOM_RAND and uses the low 8 bits of the result
; as the starting offset into the noise buffer.
;
; NOTE(review): the inner loop always handles 16 bytes per step, so when width
; is not a multiple of 16 it reads and writes past `width` on that row.
; NOTE(review): xmm3-xmm5 are expected to survive the call to LIBAOM_RAND;
; confirm that holds for the target ABI.
global sym(aom_plane_add_noise_sse2) PRIVATE
|
||||
sym(aom_plane_add_noise_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 8
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
; get the clamps in registers
|
||||
mov rdx, arg(2) ; blackclamp
|
||||
movdqu xmm3, [rdx]
|
||||
mov rdx, arg(3) ; whiteclamp
|
||||
movdqu xmm4, [rdx]
|
||||
mov rdx, arg(4) ; bothclamp
|
||||
movdqu xmm5, [rdx]
|
||||
|
||||
; one iteration per row
.addnoise_loop:
|
||||
call sym(LIBAOM_RAND) WRT_PLT
|
||||
mov rcx, arg(1) ;noise
|
||||
; keep only the low 8 bits of the random value: offset 0..255
and rax, 0xff
|
||||
add rcx, rax
|
||||
|
||||
; rdi = noise + offset, rsi = start of the current row, rax = column index
mov rdi, rcx
|
||||
movsxd rcx, dword arg(5) ;[Width]
|
||||
mov rsi, arg(0) ;Pos
|
||||
xor rax,rax
|
||||
|
||||
; one iteration per 16 pixels of the row
.addnoise_nextset:
|
||||
movdqu xmm1,[rsi+rax] ; get the source
|
||||
|
||||
psubusb xmm1, xmm3 ; subtract black clamp
|
||||
paddusb xmm1, xmm5 ; add both clamp
|
||||
psubusb xmm1, xmm4 ; subtract whiteclamp
|
||||
|
||||
movdqu xmm2,[rdi+rax] ; get the noise for this line
|
||||
paddb xmm1,xmm2 ; add it in
|
||||
movdqu [rsi+rax],xmm1 ; store the result
|
||||
|
||||
add rax,16 ; move to the next line
|
||||
|
||||
cmp rax, rcx
|
||||
jl .addnoise_nextset
|
||||
|
||||
; advance the saved start pointer by one row and count the row down
movsxd rax, dword arg(7) ; Pitch
|
||||
add arg(0), rax ; Start += Pitch
|
||||
sub dword arg(6), 1 ; Height -= 1
|
||||
jg .addnoise_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
rd42:
|
||||
times 8 dw 0x04
|
||||
four8s:
|
||||
times 4 dd 8
|
|
@ -1,661 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
;
|
||||
; This source code is subject to the terms of the BSD 2 Clause License and
|
||||
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
; was not distributed with this source code in the LICENSE file, you can
|
||||
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
; Media Patent License 1.0 was not distributed with this source code in the
|
||||
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
;
|
||||
|
||||
%include "aom_ports/x86_abi_support.asm"
|
||||
|
||||
;macro in deblock functions
|
||||
; FIRST_2_ROWS: first half of the 5-tap deblock step.
; In:  xmm0 = centre pixels, xmm1 / xmm3 = first pair of neighbours,
;      flimit = 16 threshold bytes.
; Out: xmm5 = rounded average of the two neighbours,
;      xmm7 = byte mask, 0xFF where |centre - neighbour| >= flimit for
;             either neighbour.
; Clobbers xmm1, xmm2, xmm3, xmm4, xmm6; xmm0 is preserved.
%macro FIRST_2_ROWS 0
|
||||
movdqa xmm4, xmm0
|
||||
movdqa xmm6, xmm0
|
||||
movdqa xmm5, xmm1
|
||||
pavgb xmm5, xmm3
|
||||
|
||||
;calculate absolute value
|
||||
; |a - b| = sat(a - b) + sat(b - a): one of the two terms is always zero
psubusb xmm4, xmm1
|
||||
psubusb xmm1, xmm0
|
||||
psubusb xmm6, xmm3
|
||||
psubusb xmm3, xmm0
|
||||
paddusb xmm4, xmm1
|
||||
paddusb xmm6, xmm3
|
||||
|
||||
;get threshold
|
||||
movdqa xmm2, flimit
|
||||
pxor xmm1, xmm1
|
||||
movdqa xmm7, xmm2
|
||||
|
||||
;get mask
|
||||
; sat(flimit - diff) == 0 exactly when diff >= flimit
psubusb xmm2, xmm4
|
||||
psubusb xmm7, xmm6
|
||||
pcmpeqb xmm2, xmm1
|
||||
pcmpeqb xmm7, xmm1
|
||||
por xmm7, xmm2
|
||||
%endmacro
|
||||
|
||||
; SECOND_2_ROWS: second half of the 5-tap deblock step; run after
; FIRST_2_ROWS.
; In:  xmm0 = centre pixels, xmm1 / xmm3 = second pair of neighbours,
;      xmm5 = neighbour average and xmm7 = mask from FIRST_2_ROWS,
;      flimit = 16 threshold bytes.
; Out: xmm0 = centre pixel where any of the four differences was >= flimit,
;             otherwise avg(avg(pair1 average, pair2 average), centre),
;             each step being a rounding pavgb.
; Clobbers xmm1-xmm7.
%macro SECOND_2_ROWS 0
|
||||
movdqa xmm6, xmm0
|
||||
movdqa xmm4, xmm0
|
||||
movdqa xmm2, xmm1
|
||||
pavgb xmm1, xmm3
|
||||
|
||||
;calculate absolute value
|
||||
psubusb xmm6, xmm2
|
||||
psubusb xmm2, xmm0
|
||||
psubusb xmm4, xmm3
|
||||
psubusb xmm3, xmm0
|
||||
paddusb xmm6, xmm2
|
||||
paddusb xmm4, xmm3
|
||||
|
||||
; combine the averages of both neighbour pairs
pavgb xmm5, xmm1
|
||||
|
||||
;get threshold
|
||||
movdqa xmm2, flimit
|
||||
pxor xmm1, xmm1
|
||||
movdqa xmm3, xmm2
|
||||
|
||||
;get mask
|
||||
psubusb xmm2, xmm6
|
||||
psubusb xmm3, xmm4
|
||||
pcmpeqb xmm2, xmm1
|
||||
pcmpeqb xmm3, xmm1
|
||||
|
||||
; accumulate into the mask produced by FIRST_2_ROWS
por xmm7, xmm2
|
||||
por xmm7, xmm3
|
||||
|
||||
; final average with the centre pixel
pavgb xmm5, xmm0
|
||||
|
||||
;decide if or not to use filtered value
|
||||
; xmm0 = (centre & mask) + (filtered & ~mask)
pand xmm0, xmm7
|
||||
pandn xmm7, xmm5
|
||||
paddusb xmm0, xmm7
|
||||
%endmacro
|
||||
|
||||
; UPDATE_FLIMIT: copy the next 16 threshold bytes from [rbx] to the stack slot
; at [rsp] and advance rbx by 16. Both accesses are aligned loads/stores
; (movdqa), so [rbx] and [rsp] must be 16-byte aligned. Clobbers xmm2.
%macro UPDATE_FLIMIT 0
|
||||
movdqa xmm2, XMMWORD PTR [rbx]
|
||||
movdqa [rsp], xmm2
|
||||
add rbx, 16
|
||||
%endmacro
|
||||
|
||||
;void aom_post_proc_down_and_across_mb_row_sse2
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned char *dst_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; int dst_pixels_per_line,
|
||||
; int cols,
|
||||
; int *flimits,
|
||||
; int size
|
||||
;)
|
||||
;
; For each of `size` rows: first a vertical pass that filters src (rows -2..+2
; around the current row) into dst 16 pixels at a time, then a horizontal pass
; that filters the freshly written dst row in place (columns -2..+2).
; A new 16-byte block of thresholds is fetched from `flimits` for every 16
; columns, and the pointer is rewound at the start of each pass.
; The horizontal pass writes 8 border bytes before dst and 8 bytes starting at
; dst + cols.
; NOTE(review): MMX registers mm0/mm1 are used and no emms is visible in this
; function - confirm the caller clears the MMX state.
global sym(aom_post_proc_down_and_across_mb_row_sse2) PRIVATE
|
||||
sym(aom_post_proc_down_and_across_mb_row_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SAVE_XMM 7
|
||||
push rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
ALIGN_STACK 16, rax
|
||||
sub rsp, 16
|
||||
|
||||
; put flimit on stack
|
||||
mov rbx, arg(5) ;flimits ptr
|
||||
UPDATE_FLIMIT
|
||||
|
||||
%define flimit [rsp]
|
||||
|
||||
mov rsi, arg(0) ;src_ptr
|
||||
mov rdi, arg(1) ;dst_ptr
|
||||
|
||||
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
|
||||
movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
|
||||
.nextrow:
|
||||
xor rdx, rdx ;col
|
||||
; vertical pass: src -> dst, 16 columns per iteration
.nextcol:
|
||||
;load current and next 2 rows
|
||||
movdqu xmm0, XMMWORD PTR [rsi]
|
||||
movdqu xmm1, XMMWORD PTR [rsi + rax]
|
||||
movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
|
||||
|
||||
FIRST_2_ROWS
|
||||
|
||||
;load above 2 rows
|
||||
; rax is negated temporarily so the same addressing reaches the rows above
neg rax
|
||||
movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
|
||||
movdqu xmm3, XMMWORD PTR [rsi + rax]
|
||||
|
||||
SECOND_2_ROWS
|
||||
|
||||
movdqu XMMWORD PTR [rdi], xmm0
|
||||
|
||||
neg rax ; positive stride
|
||||
add rsi, 16
|
||||
add rdi, 16
|
||||
|
||||
add rdx, 16
|
||||
cmp edx, dword arg(4) ;cols
|
||||
jge .downdone
|
||||
UPDATE_FLIMIT
|
||||
jmp .nextcol
|
||||
|
||||
.downdone:
|
||||
; done with the all cols, start the across filtering in place
|
||||
; rewind both pointers to the start of the row
sub rsi, rdx
|
||||
sub rdi, rdx
|
||||
|
||||
mov rbx, arg(5) ; flimits
|
||||
UPDATE_FLIMIT
|
||||
|
||||
; dup the first byte into the left border 8 times
|
||||
movq mm1, [rdi]
|
||||
punpcklbw mm1, mm1
|
||||
punpcklwd mm1, mm1
|
||||
punpckldq mm1, mm1
|
||||
mov rdx, -8
|
||||
movq [rdi+rdx], mm1
|
||||
|
||||
; dup the last byte into the right border
|
||||
movsxd rdx, dword arg(4)
|
||||
movq mm1, [rdi + rdx + -1]
|
||||
punpcklbw mm1, mm1
|
||||
punpcklwd mm1, mm1
|
||||
punpckldq mm1, mm1
|
||||
movq [rdi+rdx], mm1
|
||||
|
||||
xor rdx, rdx
|
||||
; preload mm0/mm1 with the 16 bytes left of the row so the first delayed
; store below rewrites them unchanged
movq mm0, QWORD PTR [rdi-16];
|
||||
movq mm1, QWORD PTR [rdi-8];
|
||||
|
||||
; horizontal pass: dst filtered in place, 16 columns per iteration
.acrossnextcol:
|
||||
movdqu xmm0, XMMWORD PTR [rdi + rdx]
|
||||
movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
|
||||
movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
|
||||
|
||||
FIRST_2_ROWS
|
||||
|
||||
movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
|
||||
movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
|
||||
|
||||
SECOND_2_ROWS
|
||||
|
||||
; the store is delayed by one iteration so that the unfiltered left
; neighbours are still in memory when the next 16 pixels are loaded
movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
|
||||
movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
|
||||
movdq2q mm0, xmm0
|
||||
psrldq xmm0, 8
|
||||
movdq2q mm1, xmm0
|
||||
|
||||
add rdx, 16
|
||||
cmp edx, dword arg(4) ;cols
|
||||
jge .acrossdone
|
||||
UPDATE_FLIMIT
|
||||
jmp .acrossnextcol
|
||||
|
||||
.acrossdone:
|
||||
; last 16 pixels
|
||||
movq QWORD PTR [rdi+rdx-16], mm0
|
||||
|
||||
; the final 8 bytes are written only when rdx landed exactly on cols
cmp edx, dword arg(4)
|
||||
jne .throw_last_8
|
||||
movq QWORD PTR [rdi+rdx-8], mm1
|
||||
.throw_last_8:
|
||||
; done with this row
|
||||
add rsi,rax ;next src line
|
||||
; NOTE(review): mov into eax zero-extends, unlike the movsxd used in the
; prologue, so a negative stride would not survive here - confirm callers
; only pass non-negative strides.
mov eax, dword arg(3) ;dst_pixels_per_line
|
||||
add rdi,rax ;next destination
|
||||
mov eax, dword arg(2) ;src_pixels_per_line
|
||||
|
||||
mov rbx, arg(5) ;flimits
|
||||
UPDATE_FLIMIT
|
||||
|
||||
dec rcx ;decrement count
|
||||
jnz .nextrow ;next row
|
||||
|
||||
add rsp, 16
|
||||
pop rsp
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbx
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
%undef flimit
|
||||
|
||||
;void aom_mbpost_proc_down_xmm(unsigned char *dst,
|
||||
; int pitch, int rows, int cols,int flimit)
|
||||
;
; Vertical post-processing filter, 8 columns per outer iteration, in place.
; For each column a running sum (xmm5, 16-bit lanes) and running sum of
; squares (xmm6/xmm7, 32-bit lanes) over a 15-row window are kept. Where
; 15 * sumsq - sum * sum - flimit is negative the pixel becomes
; (pixel + sum + aom_rv[row & 127 + lane]) >> 4, otherwise it is unchanged.
; Results go through a 16-entry ring buffer on the stack and are written back
; 8 rows late so the window keeps reading unfiltered data.
; Side effects: replicates the first row into the 8 rows above and the last
; row into the 8 rows below; modifies its own shadowed arguments.
; NOTE(review): mm0 is used and no emms is visible in this function - confirm
; the caller clears the MMX state.
extern sym(aom_rv)
|
||||
global sym(aom_mbpost_proc_down_xmm) PRIVATE
|
||||
sym(aom_mbpost_proc_down_xmm):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
ALIGN_STACK 16, rax
|
||||
sub rsp, 128+16
|
||||
|
||||
; unsigned char d[16][8] at [rsp]
|
||||
; create flimit2 at [rsp+128]
|
||||
; the threshold is replicated into four dwords for use with psubd
mov eax, dword ptr arg(4) ;flimit
|
||||
mov [rsp+128], eax
|
||||
mov [rsp+128+4], eax
|
||||
mov [rsp+128+8], eax
|
||||
mov [rsp+128+12], eax
|
||||
%define flimit4 [rsp+128]
|
||||
|
||||
; 64-bit builds keep the address of aom_rv in r8 for the whole function
%if ABI_IS_32BIT=0
|
||||
lea r8, [GLOBAL(sym(aom_rv))]
|
||||
%endif
|
||||
|
||||
;rows +=8;
|
||||
; done once, in the saved argument; the row loop runs 8 rows past the end
; so the delayed write-back drains
add dword arg(2), 8
|
||||
|
||||
;for(c=0; c<cols; c+=8)
|
||||
.loop_col:
|
||||
mov rsi, arg(0) ; s
|
||||
pxor xmm0, xmm0 ;
|
||||
|
||||
movsxd rax, dword ptr arg(1) ;pitch ;
|
||||
|
||||
; this copies the last row down into the border 8 rows
|
||||
; arg(2) already holds rows + 8, so (arg(2) - 9) * pitch addresses the last
; real row
mov rdi, rsi
|
||||
mov rdx, arg(2)
|
||||
sub rdx, 9
|
||||
imul rdx, rax
|
||||
lea rdi, [rdi+rdx]
|
||||
movq xmm1, QWORD ptr[rdi] ; first row
|
||||
mov rcx, 8
|
||||
.init_borderd: ; initialize borders
|
||||
lea rdi, [rdi + rax]
|
||||
movq [rdi], xmm1
|
||||
|
||||
dec rcx
|
||||
jne .init_borderd
|
||||
|
||||
neg rax ; rax = -pitch
|
||||
|
||||
; this copies the first row up into the border 8 rows
|
||||
mov rdi, rsi
|
||||
movq xmm1, QWORD ptr[rdi] ; first row
|
||||
mov rcx, 8
|
||||
.init_border: ; initialize borders
|
||||
lea rdi, [rdi + rax]
|
||||
movq [rdi], xmm1
|
||||
|
||||
dec rcx
|
||||
jne .init_border
|
||||
|
||||
|
||||
|
||||
; rsi = s - 8 * pitch (oldest row of the window); rax back to +pitch
lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
|
||||
neg rax
|
||||
|
||||
pxor xmm5, xmm5
|
||||
pxor xmm6, xmm6 ;
|
||||
|
||||
pxor xmm7, xmm7 ;
|
||||
mov rdi, rsi
|
||||
|
||||
mov rcx, 15 ;
|
||||
|
||||
; prime sum and sum of squares with the 15 rows s[-8] .. s[6]; on exit rdi
; points at s + 7 * pitch
.loop_initvar:
|
||||
movq xmm1, QWORD PTR [rdi];
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
|
||||
paddw xmm5, xmm1 ;
|
||||
pmullw xmm1, xmm1 ;
|
||||
|
||||
movdqa xmm2, xmm1 ;
|
||||
punpcklwd xmm1, xmm0 ;
|
||||
|
||||
punpckhwd xmm2, xmm0 ;
|
||||
paddd xmm6, xmm1 ;
|
||||
|
||||
paddd xmm7, xmm2 ;
|
||||
lea rdi, [rdi+rax] ;
|
||||
|
||||
dec rcx
|
||||
jne .loop_initvar
|
||||
;save the var and sum
|
||||
xor rdx, rdx
|
||||
; rdx = row counter; rsi = row leaving the window, rdi = row entering it
.loop_row:
|
||||
movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
|
||||
movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
|
||||
|
||||
punpcklbw xmm1, xmm0
|
||||
punpcklbw xmm2, xmm0
|
||||
|
||||
; slide the running sum
paddw xmm5, xmm2
|
||||
psubw xmm5, xmm1
|
||||
|
||||
; slide the running sum of squares: add entering^2 ...
pmullw xmm2, xmm2
|
||||
movdqa xmm4, xmm2
|
||||
|
||||
punpcklwd xmm2, xmm0
|
||||
punpckhwd xmm4, xmm0
|
||||
|
||||
paddd xmm6, xmm2
|
||||
paddd xmm7, xmm4
|
||||
|
||||
; ... and subtract leaving^2
pmullw xmm1, xmm1
|
||||
movdqa xmm2, xmm1
|
||||
|
||||
punpcklwd xmm1, xmm0
|
||||
psubd xmm6, xmm1
|
||||
|
||||
punpckhwd xmm2, xmm0
|
||||
psubd xmm7, xmm2
|
||||
|
||||
|
||||
; xmm3 = 16 * sumsq - sumsq = 15 * sumsq (low four columns)
movdqa xmm3, xmm6
|
||||
pslld xmm3, 4
|
||||
|
||||
psubd xmm3, xmm6
|
||||
movdqa xmm1, xmm5
|
||||
|
||||
; sum * sum as 32-bit values: low halves from pmullw, high from pmulhw
movdqa xmm4, xmm5
|
||||
pmullw xmm1, xmm1
|
||||
|
||||
pmulhw xmm4, xmm4
|
||||
movdqa xmm2, xmm1
|
||||
|
||||
punpcklwd xmm1, xmm4
|
||||
punpckhwd xmm2, xmm4
|
||||
|
||||
; xmm4 = 15 * sumsq (high four columns)
movdqa xmm4, xmm7
|
||||
pslld xmm4, 4
|
||||
|
||||
psubd xmm4, xmm7
|
||||
|
||||
psubd xmm3, xmm1
|
||||
psubd xmm4, xmm2
|
||||
|
||||
psubd xmm3, flimit4
|
||||
psubd xmm4, flimit4
|
||||
|
||||
; arithmetic shift by 31 turns "negative" into an all-ones lane
psrad xmm3, 31
|
||||
psrad xmm4, 31
|
||||
|
||||
; narrow the eight dword masks to eight byte masks
packssdw xmm3, xmm4
|
||||
packsswb xmm3, xmm0
|
||||
|
||||
; current row: rsi + 8 * pitch
movq xmm1, QWORD PTR [rsi+rax*8]
|
||||
|
||||
movq xmm2, xmm1
|
||||
punpcklbw xmm1, xmm0
|
||||
|
||||
paddw xmm1, xmm5
|
||||
mov rcx, rdx
|
||||
|
||||
; noise index = row & 127; the three branches differ only in how the
; address of aom_rv is formed
and rcx, 127
|
||||
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
|
||||
push rax
|
||||
lea rax, [GLOBAL(sym(aom_rv))]
|
||||
movdqu xmm4, [rax + rcx*2] ;aom_rv[rcx*2]
|
||||
pop rax
|
||||
%elif ABI_IS_32BIT=0
|
||||
movdqu xmm4, [r8 + rcx*2] ;aom_rv[rcx*2]
|
||||
%else
|
||||
movdqu xmm4, [sym(aom_rv) + rcx*2]
|
||||
%endif
|
||||
|
||||
paddw xmm1, xmm4
|
||||
;paddw xmm1, eight8s
|
||||
psraw xmm1, 4
|
||||
|
||||
; select: filtered value where the mask is set, original pixel elsewhere
packuswb xmm1, xmm0
|
||||
pand xmm1, xmm3
|
||||
|
||||
pandn xmm3, xmm2
|
||||
por xmm1, xmm3
|
||||
|
||||
; park the result in ring-buffer slot row & 15
and rcx, 15
|
||||
movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
|
||||
|
||||
; nothing to write back until 8 rows have been produced
cmp edx, 8
|
||||
jl .skip_assignment
|
||||
|
||||
; write the result from 8 rows ago to [rsi], the row that just left the
; window
mov rcx, rdx
|
||||
sub rcx, 8
|
||||
and rcx, 15
|
||||
movq mm0, [rsp + rcx*8] ;d[rcx*8]
|
||||
movq [rsi], mm0
|
||||
|
||||
.skip_assignment:
|
||||
lea rsi, [rsi+rax]
|
||||
|
||||
lea rdi, [rdi+rax]
|
||||
add rdx, 1
|
||||
|
||||
cmp edx, dword arg(2) ;rows
|
||||
jl .loop_row
|
||||
|
||||
; NOTE(review): this is a 32-bit add on the saved pointer argument; on a
; 64-bit build a carry out of the low dword would be lost - confirm whether
; that can occur for the buffers passed in.
add dword arg(0), 8 ; s += 8
|
||||
sub dword arg(3), 8 ; cols -= 8
|
||||
cmp dword arg(3), 0
|
||||
jg .loop_col
|
||||
|
||||
add rsp, 128+16
|
||||
pop rsp
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
%undef flimit4
|
||||
|
||||
|
||||
;void aom_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                   int pitch, int rows, int cols,int flimit)
;
; Horizontal post-processing filter, applied in place one row at a time.
; For every pixel a running sum (xmm6) and running sum of squares (xmm7) over
; the 15 pixels s[-7] .. s[+7] are kept. Where
; 15 * sumsq - sum * sum - flimit is negative the pixel is replaced by
; (pixel + sum + 8) >> 4, otherwise it is left unchanged. Four pixels are
; handled per inner iteration and results are written back 8 pixels late
; (through mm0/mm1) so the window keeps reading unfiltered data.
;
; Side effects: writes 8 border bytes before each row and 8 bytes starting at
; s + cols; modifies its own shadowed arguments.
; NOTE(review): MMX registers are used and no emms is visible in this
; function - confirm the caller clears the MMX state.
global sym(aom_mbpost_proc_across_ip_xmm) PRIVATE
sym(aom_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    ; the threshold is replicated into four dwords for use with psubd
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

    xor         rcx, rcx ;sum=0;
    mov         rsi, arg(0); s


    ; dup the first byte into the left border 8 times
    movq        mm1, [rsi]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1

    mov         rdi, -8
    movq        [rsi+rdi], mm1

    ; dup the last byte into the right border
    movsxd      rdx, dword arg(3)
    movq        mm1, [rsi + rdx + -1]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    movq        [rsi+rdx], mm1

    ; rdx was just used to hold cols for the border copy, so the sum of
    ; squares must be cleared here, immediately before it is accumulated;
    ; clearing it earlier left it starting at cols instead of 0.
    xor         rdx, rdx ;sumsq=0;

.ip_var_loop:
    ;for(i=-8;i<=6;i++)
    ;{
    ;   sumsq += s[i]*s[i];
    ;   sum   += s[i];
    ;}
    movzx       eax, byte [rsi+rdi]
    add         ecx, eax
    mul         al
    add         edx, eax
    add         rdi, 1
    cmp         rdi, 6
    jle         .ip_var_loop


    ;mov         rax, sumsq
    ;movd        xmm7, rax
    movd        xmm7, edx

    ;mov         rax, sum
    ;movd        xmm6, rax
    movd        xmm6, ecx

    mov         rsi, arg(0) ;s
    xor         rcx, rcx

    ; iterate 8 columns past the end so the delayed stores drain
    movsxd      rdx, dword arg(3) ;cols
    add         rdx, 8
    pxor        mm0, mm0
    pxor        mm1, mm1

    pxor        xmm0, xmm0
.nextcol4:

    movd        xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
    movd        xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10

    punpcklbw   xmm1, xmm0 ; expanding
    punpcklbw   xmm2, xmm0 ; expanding

    punpcklwd   xmm1, xmm0 ; expanding to dwords
    punpcklwd   xmm2, xmm0 ; expanding to dwords

    psubd       xmm2, xmm1 ; 7--8 8--7 9--6 10--5
    paddd       xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2

    paddd       xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
    pmaddwd     xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5

    paddd       xmm6, xmm2
    paddd       xmm7, xmm1

    pshufd      xmm6, xmm6, 0 ; duplicate the last ones
    pshufd      xmm7, xmm7, 0 ; duplicate the last ones

    psrldq      xmm1, 4 ; 8--7 9--6 10--5 0000
    psrldq      xmm2, 4 ; 8--7 9--6 10--5 0000

    ; the three shuffle/add rounds below turn the per-pixel deltas into
    ; prefix sums, giving each of the four lanes its own window totals
    pshufd      xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
    pshufd      xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared

    paddd       xmm6, xmm4
    paddd       xmm7, xmm3

    pshufd      xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
    pshufd      xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    pshufd      xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
    pshufd      xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    ; xmm3 = sum * sum per lane
    movdqa      xmm3, xmm6
    pmaddwd     xmm3, xmm3

    ; xmm5 = 16 * sumsq - sumsq - sum * sum - flimit
    movdqa      xmm5, xmm7
    pslld       xmm5, 4

    psubd       xmm5, xmm7
    psubd       xmm5, xmm3

    psubd       xmm5, flimit4
    ; arithmetic shift by 31 turns "negative" into an all-ones lane
    psrad       xmm5, 31

    packssdw    xmm5, xmm0
    packsswb    xmm5, xmm0

    movd        xmm1, DWORD PTR [rsi+rcx]
    movq        xmm2, xmm1

    punpcklbw   xmm1, xmm0
    punpcklwd   xmm1, xmm0

    ; filtered = (pixel + sum + 8) >> 4
    paddd       xmm1, xmm6
    paddd       xmm1, [GLOBAL(four8s)]

    psrad       xmm1, 4
    packssdw    xmm1, xmm0

    ; select: filtered value where the mask is set, original pixel elsewhere
    packuswb    xmm1, xmm0
    pand        xmm1, xmm5

    pandn       xmm5, xmm2
    por         xmm5, xmm1

    ; two-stage delay line: mm0 holds the result from two iterations ago
    movd        [rsi+rcx-8], mm0
    movq        mm0, mm1

    movdq2q     mm1, xmm5
    ; carry the last lane's totals into lane 0 for the next four pixels
    psrldq      xmm7, 12

    psrldq      xmm6, 12
    add         rcx, 4

    cmp         rcx, rdx
    jl          .nextcol4

    ;s+=pitch;
    movsxd      rax, dword arg(1)
    add         arg(0), rax

    sub         dword arg(2), 1 ;rows-=1
    cmp         dword arg(2), 0
    jg          .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
|
||||
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
four8s:
|
||||
times 4 dd 8
|
|
@ -1,153 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
#include "test/clear_system_state.h"
|
||||
#include "test/register_state_check.h"
|
||||
#include "third_party/googletest/src/include/gtest/gtest.h"
|
||||
#include "./aom_dsp_rtcd.h"
|
||||
#include "aom/aom_integer.h"
|
||||
#include "aom_dsp/postproc.h"
|
||||
#include "aom_mem/aom_mem.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// TODO(jimbankoski): make width and height integers not unsigned.
|
||||
// Signature shared by every add-noise implementation exercised by this test:
// the image at `start` (height rows of width pixels, pitch apart) is modified
// in place using the `noise` buffer and three 16-entry clamp tables.
typedef void (*AddNoiseFunc)(unsigned char *start, char *noise,
|
||||
char blackclamp[16], char whiteclamp[16],
|
||||
char bothclamp[16], unsigned int width,
|
||||
unsigned int height, int pitch);
|
||||
|
||||
// Parameterized fixture: each instantiation supplies one add-noise function.
class AddNoiseTest : public ::testing::TestWithParam<AddNoiseFunc> {
 public:
  virtual ~AddNoiseTest() {}

  // Clear the system state after every test so SIMD variants leave no state
  // behind for the next one.
  virtual void TearDown() {
    libaom_test::ClearSystemState();
  }
};
|
||||
|
||||
// Returns the population standard deviation (divisor 6) of six samples.
double stddev6(char a, char b, char c, char d, char e, char f) {
  const char samples[6] = { a, b, c, d, e, f };

  // Integer sum first, then a single floating-point division for the mean.
  int total = 0;
  for (int k = 0; k < 6; ++k) total += samples[k];
  const double mean = total / 6.0;

  // Accumulate squared deviations in argument order.
  double sq_dev_sum = 0.0;
  for (int k = 0; k < 6; ++k) {
    const double delta = samples[k] - mean;
    sq_dev_sum += delta * delta;
  }
  return sqrt(sq_dev_sum / 6.0);
}
|
||||
|
||||
// Checks that the function under test actually perturbs a flat image in both
// directions, and that the clamps keep pixels from wrapping past 255 or 0.
TEST_P(AddNoiseTest, CheckNoiseAdded) {
  DECLARE_ALIGNED(16, char, blackclamp[16]);
  DECLARE_ALIGNED(16, char, whiteclamp[16]);
  DECLARE_ALIGNED(16, char, bothclamp[16]);
  const int width = 64;
  const int height = 64;
  const int image_size = width * height;
  char noise[3072];
  const int clamp = aom_setup_noise(4.4, sizeof(noise), noise);

  for (int i = 0; i < 16; i++) {
    blackclamp[i] = clamp;
    whiteclamp[i] = clamp;
    bothclamp[i] = 2 * clamp;
  }

  uint8_t *const s = reinterpret_cast<uint8_t *>(aom_calloc(image_size, 1));
  // Fail the test cleanly instead of dereferencing NULL if allocation failed.
  ASSERT_TRUE(s != NULL);
  memset(s, 99, image_size);

  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
                                      bothclamp, width, height, width));

  // Check to make sure we don't end up having either the same or no added
  // noise either vertically or horizontally.
  for (int i = 0; i < image_size - 6 * width - 6; ++i) {
    const double hd = stddev6(s[i] - 99, s[i + 1] - 99, s[i + 2] - 99,
                              s[i + 3] - 99, s[i + 4] - 99, s[i + 5] - 99);
    const double vd = stddev6(s[i] - 99, s[i + width] - 99,
                              s[i + 2 * width] - 99, s[i + 3 * width] - 99,
                              s[i + 4 * width] - 99, s[i + 5 * width] - 99);

    EXPECT_NE(hd, 0);
    EXPECT_NE(vd, 0);
  }

  // Initialize pixels in the image to 255 and check for roll over.
  memset(s, 255, image_size);

  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
                                      bothclamp, width, height, width));

  // Check to make sure don't roll over.
  for (int i = 0; i < image_size; ++i) {
    EXPECT_GT(static_cast<int>(s[i]), clamp) << "i = " << i;
  }

  // Initialize pixels in the image to 0 and check for roll under.
  memset(s, 0, image_size);

  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
                                      bothclamp, width, height, width));

  // Check to make sure don't roll under.
  for (int i = 0; i < image_size; ++i) {
    EXPECT_LT(static_cast<int>(s[i]), 255 - clamp) << "i = " << i;
  }

  aom_free(s);
}
|
||||
|
||||
// Runs the function under test and the C reference on identical images with
// the same rand() seed and requires byte-identical output.
TEST_P(AddNoiseTest, CheckCvsAssembly) {
  DECLARE_ALIGNED(16, char, blackclamp[16]);
  DECLARE_ALIGNED(16, char, whiteclamp[16]);
  DECLARE_ALIGNED(16, char, bothclamp[16]);
  const int width = 64;
  const int height = 64;
  const int image_size = width * height;
  char noise[3072];

  const int clamp = aom_setup_noise(4.4, sizeof(noise), noise);

  for (int i = 0; i < 16; i++) {
    blackclamp[i] = clamp;
    whiteclamp[i] = clamp;
    bothclamp[i] = 2 * clamp;
  }

  uint8_t *const s = reinterpret_cast<uint8_t *>(aom_calloc(image_size, 1));
  // Fail the test cleanly instead of dereferencing NULL if allocation failed.
  ASSERT_TRUE(s != NULL);
  uint8_t *const d = reinterpret_cast<uint8_t *>(aom_calloc(image_size, 1));
  if (d == NULL) {
    // Release the first buffer before bailing out so it is not leaked.
    aom_free(s);
    FAIL() << "failed to allocate the reference image";
  }

  memset(s, 99, image_size);
  memset(d, 99, image_size);

  // Re-seed before each call so both implementations see the same rand()
  // sequence.
  srand(0);
  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
                                      bothclamp, width, height, width));
  srand(0);
  ASM_REGISTER_STATE_CHECK(aom_plane_add_noise_c(
      d, noise, blackclamp, whiteclamp, bothclamp, width, height, width));

  for (int i = 0; i < image_size; ++i) {
    EXPECT_EQ(static_cast<int>(s[i]), static_cast<int>(d[i])) << "i = " << i;
  }

  aom_free(d);
  aom_free(s);
}
|
||||
|
||||
// The C implementation is always tested.
INSTANTIATE_TEST_CASE_P(C, AddNoiseTest,
|
||||
::testing::Values(aom_plane_add_noise_c));
|
||||
|
||||
// The SSE2 implementation is tested only when HAVE_SSE2 is set.
#if HAVE_SSE2
|
||||
INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest,
|
||||
::testing::Values(aom_plane_add_noise_sse2));
|
||||
#endif
|
||||
|
||||
// The MSA implementation is tested only when HAVE_MSA is set.
#if HAVE_MSA
|
||||
INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest,
|
||||
::testing::Values(aom_plane_add_noise_msa));
|
||||
#endif
|
||||
} // namespace
|
Загрузка…
Ссылка в новой задаче