Merge changes I40454d26,I892e76d5,I865ab3f9,I4a4bec17,I61c4351e,I37eb3559,I1031c556,I8c8f1f42
* changes:
  delete vp9_loopfilter_sse2.asm
  vp9_loopfilter_intrin_sse2: cosmetics: fix indent
  delete x86/vp9_loopfilter_x86.h
  vp9_loopfilter_intrin_sse2: make some funcs static
  vp9_loopfilter_intrin_sse2: remove unused uv funcs
  vp9_loopfilter: remove uv function typedef
  filter_block_plane: reuse some constants
  vp9_loopfilter.c: make some functions static
Commit 98e132bde0
vp9/common/vp9_loopfilter.c

@@ -33,8 +33,7 @@ static void lf_init_lut(loop_filter_info_n *lfi) {
  lfi->mode_lf_lut[NEWMV] = 1;
}

void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
                                      int sharpness_lvl) {
static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) {
  int lvl;

  // For each possible value for the loop filter fill out limits

@@ -62,7 +61,7 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
  int i;

  // init limits for given sharpness
  vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
  update_sharpness(lfi, cm->sharpness_level);
  cm->last_sharpness_level = cm->sharpness_level;

  // init LUT for lvl and hev thr picking

@@ -73,8 +72,8 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
    vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}

void vp9_loop_filter_frame_init(VP9_COMMON *cm, MACROBLOCKD *xd,
                                int default_filt_lvl) {
static void loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                   int default_filt_lvl) {
  int seg;
  // n_shift is the a multiplier for lf_deltas
  // the multiplier is 1 for when filter_lvl is between 0 and 31;

@@ -84,7 +83,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, MACROBLOCKD *xd,

  // update limits if sharpness has changed
  if (cm->last_sharpness_level != cm->sharpness_level) {
    vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
    update_sharpness(lfi, cm->sharpness_level);
    cm->last_sharpness_level = cm->sharpness_level;
  }

@@ -118,9 +117,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, MACROBLOCKD *xd,
  }
}

static int build_lfi(const VP9_COMMON *cm, const MB_MODE_INFO *mbmi,
                     struct loop_filter_info *lfi) {
  const loop_filter_info_n *const lfi_n = &cm->lf_info;
static int build_lfi(const loop_filter_info_n *const lfi_n,
                     const MB_MODE_INFO *const mbmi,
                     struct loop_filter_info *const lfi) {
  const int seg = mbmi->segment_id;
  const int ref = mbmi->ref_frame[0];
  const int mode = lfi_n->mode_lf_lut[mbmi->mode];

@@ -231,13 +230,13 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
  }
}

static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
static void filter_block_plane(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                               int plane, int mi_row, int mi_col) {
  const int ss_x = xd->plane[plane].subsampling_x;
  const int ss_y = xd->plane[plane].subsampling_y;
  const int row_step = 1 << xd->plane[plane].subsampling_y;
  const int col_step = 1 << xd->plane[plane].subsampling_x;
  struct buf_2d * const dst = &xd->plane[plane].dst;
  const int row_step = 1 << ss_x;
  const int col_step = 1 << ss_y;
  struct buf_2d *const dst = &xd->plane[plane].dst;
  uint8_t* const dst0 = dst->buf;
  unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};
  unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};

@@ -245,8 +244,8 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
  struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
  int r, c;
  MODE_INFO *mi = xd->mode_info_context;
  int row_step_stride = cm->mode_info_stride * row_step;
  const MODE_INFO *mi = xd->mode_info_context;
  const int row_step_stride = cm->mode_info_stride * row_step;

  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
    unsigned int mask_16x16_c = 0;

@@ -272,8 +271,7 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;

      // Filter level can vary per MI
      if (!build_lfi(cm, &mi[c].mbmi,
                     lfi[r] + (c >> xd->plane[plane].subsampling_x)))
      if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x)))
        continue;

      // Build masks based on the transform size of each block

@@ -355,7 +353,7 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
  int mi_row, mi_col;

  // Initialize the loop filter for this frame.
  vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
  loop_filter_frame_init(cm, xd, frame_filter_level);

  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
    MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
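The pattern across these hunks is uniform: helpers that are only called from this file (update_sharpness, loop_filter_frame_init, build_lfi) drop their vp9_ prefix and become static, and build_lfi now receives just the loop_filter_info_n it reads instead of the whole VP9_COMMON. The snippet below is a minimal standalone sketch of that refactor under invented names, not libvpx code.

#include <stdio.h>

/* Hypothetical module state, standing in for loop_filter_info_n. */
typedef struct {
  int sharpness;
  int limit;
} filter_info;

/* File-local helper: static, and it takes only the struct it actually
 * touches, so nothing outside this translation unit can depend on it and
 * the compiler is free to inline it. */
static void update_sharpness(filter_info *const info, int sharpness_lvl) {
  info->sharpness = sharpness_lvl;
  info->limit = sharpness_lvl > 4 ? 9 - sharpness_lvl : 9;  /* toy rule */
}

/* The public entry point keeps its external name and drives the helper. */
void module_frame_init(filter_info *info, int sharpness_lvl) {
  update_sharpness(info, sharpness_lvl);
}

int main(void) {
  filter_info info = {0, 0};
  module_frame_init(&info, 3);
  printf("sharpness=%d limit=%d\n", info.sharpness, info.limit);
  return 0;
}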
vp9/common/vp9_loopfilter.h

@@ -44,44 +44,14 @@ struct loop_filter_info {
  const uint8_t *hev_thr;
};

#define prototype_loopfilter(sym) \
  void sym(uint8_t *src, int pitch, const uint8_t *blimit, \
           const uint8_t *limit, const uint8_t *thresh, int count)

#define prototype_loopfilter_block(sym) \
  void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
           int ystride, int uv_stride, struct loop_filter_info *lfi)

#if ARCH_X86 || ARCH_X86_64
#include "x86/vp9_loopfilter_x86.h"
#endif

typedef void loop_filter_uvfunction(uint8_t *src, int pitch,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh,
                                    uint8_t *v);

/* assorted loopfilter functions which get used elsewhere */
struct VP9Common;
struct macroblockd;

void vp9_loop_filter_init(struct VP9Common *cm);

void vp9_loop_filter_frame_init(struct VP9Common *cm,
                                struct macroblockd *mbd,
                                int default_filt_lvl);

void vp9_loop_filter_frame(struct VP9Common *cm,
                           struct macroblockd *mbd,
                           int filter_level,
                           int y_only);

void vp9_loop_filter_partial_frame(struct VP9Common *cm,
                                   struct macroblockd *mbd,
                                   int default_filt_lvl);

void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
                                      int sharpness_lvl);

#endif  // VP9_COMMON_VP9_LOOPFILTER_H_
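The header hunk removes the prototype macros, the x86 header include, and the loop_filter_uvfunction typedef, which has no remaining users once the uv SSE2 paths are gone. As a reminder of what a function-type typedef like that provides, and why deleting an unused one is mechanical, here is a small standalone example; the names are invented and this is not libvpx code.

#include <stdint.h>
#include <stdio.h>

/* A function type, analogous in spirit to loop_filter_uvfunction: any
 * function with this exact signature can be referred to through it. */
typedef void edge_filter_fn(uint8_t *src, int pitch, int strength);

/* One concrete filter matching the type. */
static void blur_edge(uint8_t *src, int pitch, int strength) {
  int i;
  for (i = 0; i < 4; ++i)
    src[i * pitch] = (uint8_t)((src[i * pitch] + strength) / 2);
}

int main(void) {
  uint8_t row[16] = {200, 0, 0, 0, 180, 0, 0, 0, 160, 0, 0, 0, 140, 0, 0, 0};
  edge_filter_fn *filter = blur_edge;  /* pointer declared via the typedef */
  filter(row, 4, 20);
  printf("%d %d %d %d\n", row[0], row[4], row[8], row[12]);
  return 0;
}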
vp9/common/x86/vp9_loopfilter_intrin_sse2.c

@@ -12,17 +12,11 @@
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"

prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);

extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;

void vp9_mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
                                         int p,
                                         const unsigned char *_blimit,
                                         const unsigned char *_limit,
                                         const unsigned char *_thresh) {
static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);

@@ -483,11 +477,11 @@ void vp9_mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
  }
}

void vp9_mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                          int p,
                                          const unsigned char *_blimit,
                                          const unsigned char *_limit,
                                          const unsigned char *_thresh) {
static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                             int p,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

@@ -962,9 +956,9 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                       const unsigned char *_thresh,
                                       int count) {
  if (count == 1)
    vp9_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
  else
    vp9_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
}

void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
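The public vp9_mb_lpf_horizontal_edge_w_sse2 wrapper keeps its exported name and simply dispatches on count to the now-static 8- and 16-pixel kernels. The sketch below shows the same dispatch shape in isolation, with made-up kernel names and a toy filter rather than the real SSE2 code.

#include <stdio.h>

/* Two file-local kernels: one processes 8 pixels per call, one 16. */
static void edge_kernel_8(unsigned char *s, int pitch) {
  int i;
  for (i = 0; i < 8; ++i)
    s[i] = (unsigned char)(s[i] / 2 + s[i + pitch] / 2);  /* toy filter */
}

static void edge_kernel_16(unsigned char *s, int pitch) {
  edge_kernel_8(s, pitch);       /* a 16-wide call is two 8-wide halves here */
  edge_kernel_8(s + 8, pitch);
}

/* Public wrapper: same calling convention as before, width chosen by count. */
void edge_filter(unsigned char *s, int pitch, int count) {
  if (count == 1)
    edge_kernel_8(s, pitch);
  else
    edge_kernel_16(s, pitch);
}

int main(void) {
  unsigned char buf[32] = {0};
  buf[0] = 100;   /* first row, pixel 0 */
  buf[16] = 50;   /* second row, pixel 0 */
  edge_filter(buf, 16, 2);
  printf("%d\n", buf[0]);  /* 100/2 + 50/2 = 75 */
  return 0;
}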
@@ -1206,79 +1200,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
  }
}

void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
                                               int p,
                                               const unsigned char *_blimit,
                                               const unsigned char *_limit,
                                               const unsigned char *_thresh,
                                               unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160);

  /* Read source */
  const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 5 * p)));
  const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 4 * p)));
  const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 3 * p)));
  const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 2 * p)));
  const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 1 * p)));
  const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)),
                                        _mm_loadl_epi64((__m128i *)(v)));
  const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 1 * p)));
  const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 2 * p)));
  const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 3 * p)));
  const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 4 * p)));

  _mm_store_si128((__m128i *)(src), p4);
  _mm_store_si128((__m128i *)(src + 16), p3);
  _mm_store_si128((__m128i *)(src + 32), p2);
  _mm_store_si128((__m128i *)(src + 48), p1);
  _mm_store_si128((__m128i *)(src + 64), p0);
  _mm_store_si128((__m128i *)(src + 80), q0);
  _mm_store_si128((__m128i *)(src + 96), q1);
  _mm_store_si128((__m128i *)(src + 112), q2);
  _mm_store_si128((__m128i *)(src + 128), q3);
  _mm_store_si128((__m128i *)(src + 144), q4);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit,
                                         _thresh, 1);

  /* Store result */
  _mm_storel_epi64((__m128i *)(u - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 32)));
  _mm_storel_epi64((__m128i *)(u - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 48)));
  _mm_storel_epi64((__m128i *)(u - p),
                   _mm_loadl_epi64((__m128i *)(src + 64)));
  _mm_storel_epi64((__m128i *)u,
                   _mm_loadl_epi64((__m128i *)(src + 80)));
  _mm_storel_epi64((__m128i *)(u + p),
                   _mm_loadl_epi64((__m128i *)(src + 96)));
  _mm_storel_epi64((__m128i *)(u + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 112)));

  _mm_storel_epi64((__m128i *)(v - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 40)));
  _mm_storel_epi64((__m128i *)(v - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 56)));
  _mm_storel_epi64((__m128i *)(v - p),
                   _mm_loadl_epi64((__m128i *)(src + 72)));
  _mm_storel_epi64((__m128i *)v,
                   _mm_loadl_epi64((__m128i *)(src + 88)));
  _mm_storel_epi64((__m128i *)(v + p),
                   _mm_loadl_epi64((__m128i *)(src + 104)));
  _mm_storel_epi64((__m128i *)(v + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 120)));
}

static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;

@@ -1425,7 +1346,7 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                        thresh, 1);
                                         thresh, 1);
  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

@@ -1437,10 +1358,10 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
}

void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
                                      int p,
                                      const unsigned char *blimit,
                                      const unsigned char *limit,
                                      const unsigned char *thresh) {
                                     int p,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];

@@ -1466,32 +1387,3 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,

  transpose(src, 16, dst, p, 2);
}


void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
                                             int p,
                                             const unsigned char *blimit,
                                             const unsigned char *limit,
                                             const unsigned char *thresh,
                                             unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  /* Transpose 16x16 */
  transpose8x16(u - 8, v - 8, p, t_dst, 16);
  transpose8x16(u, v, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, 1);

  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = u - 5;
  dst[1] = v - 5;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}
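Every deleted uv helper above follows one trick: load 8 bytes from the u row and 8 bytes from the v row, glue them into a single 16-byte vector with _mm_unpacklo_epi64, run the 16-wide filter once over the packed buffer, then split the halves back out. Here is a standalone sketch of just that pack/filter/unpack data movement; the "filter" is a trivial stand-in, not the libvpx kernel.

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  /* One row of u samples and one row of v samples, 8 bytes each. */
  unsigned char u[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  unsigned char v[8] = {11, 21, 31, 41, 51, 61, 71, 81};
  unsigned char out_u[8], out_v[8];

  /* Pack: low 64 bits = u row, high 64 bits = v row. */
  __m128i row = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)u),
                                   _mm_loadl_epi64((const __m128i *)v));

  /* Stand-in for the 16-wide loop-filter call: add 5 to every sample. */
  row = _mm_add_epi8(row, _mm_set1_epi8(5));

  /* Unpack: store the low half back to u, the high half back to v. */
  _mm_storel_epi64((__m128i *)out_u, row);
  _mm_storel_epi64((__m128i *)out_v, _mm_srli_si128(row, 8));

  printf("u[0]=%d v[0]=%d\n", out_u[0], out_v[0]);  /* 15 and 16 */
  return 0;
}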
vp9/common/x86/vp9_loopfilter_sse2.asm (deleted)

@@ -1,872 +0,0 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8

[... the remaining deleted lines are the body of this file: the macros
LFH_FILTER_AND_HEV_MASK, B_FILTER, TRANSPOSE_16X8, LFV_FILTER_MASK_HEV_MASK,
BV_TRANSPOSE and BV_WRITEBACK; the assembly entry points
vp9_loop_filter_horizontal_edge_sse2, vp9_loop_filter_horizontal_edge_uv_sse2,
vp9_loop_filter_vertical_edge_sse2 and vp9_loop_filter_vertical_edge_uv_sse2,
each with the usual SHADOW_ARGS_TO_STACK/SAVE_XMM prolog and epilog; and the
SECTION_RODATA constants tfe, t80, t1s, t3, t4, ones, s9 and s63 ...]
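Before filtering an edge, this assembly computed a per-column filter mask and a hev (high edge variance) flag, exactly as its comments describe: every neighbouring-sample difference must be within limit, abs(p0 - q0) * 2 + abs(p1 - q1) / 2 must not exceed blimit, and hev fires when abs(p1 - p0) or abs(q1 - q0) exceeds thresh. The scalar sketch below restates that decision logic for a single pixel column; it is a reference illustration written for this note, not the libvpx code path.

#include <stdio.h>
#include <stdlib.h>

/* One column across a horizontal edge: p3..p0 above, q0..q3 below. */
typedef struct { int p3, p2, p1, p0, q0, q1, q2, q3; } edge_col;

/* Returns 1 if this column may be filtered (all differences small enough). */
static int filter_mask(const edge_col *c, int limit, int blimit) {
  return abs(c->p3 - c->p2) <= limit &&
         abs(c->p2 - c->p1) <= limit &&
         abs(c->p1 - c->p0) <= limit &&
         abs(c->q1 - c->q0) <= limit &&
         abs(c->q2 - c->q1) <= limit &&
         abs(c->q3 - c->q2) <= limit &&
         abs(c->p0 - c->q0) * 2 + abs(c->p1 - c->q1) / 2 <= blimit;
}

/* Returns 1 when the edge has "high edge variance". */
static int hev_mask(const edge_col *c, int thresh) {
  return abs(c->p1 - c->p0) > thresh || abs(c->q1 - c->q0) > thresh;
}

int main(void) {
  edge_col flat  = {80, 81, 82, 83, 84, 85, 86, 87};
  edge_col sharp = {80, 81, 82, 40, 200, 85, 86, 87};
  printf("flat:  mask=%d hev=%d\n", filter_mask(&flat, 10, 20), hev_mask(&flat, 15));
  printf("sharp: mask=%d hev=%d\n", filter_mask(&sharp, 10, 20), hev_mask(&sharp, 15));
  return 0;
}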
vp9/common/x86/vp9_loopfilter_x86.h (deleted)

@@ -1,35 +0,0 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_
#define VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_

/* Note:
 *
 * This platform is commonly built for runtime CPU detection. If you modify
 * any of the function mappings present in this file, be sure to also update
 * them in the function pointer initialization code
 */

#if HAVE_MMX
extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
#endif

#if HAVE_SSE2
extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
#endif

#endif  // LOOPFILTER_X86_H
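The deleted header declared its per-ISA entry points through the prototype_loopfilter_block macro from vp9_loopfilter.h, which expands a symbol name into a full prototype. The standalone sketch below shows how such a declaration macro is defined and used; the macro and function names are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* A declaration macro in the style of prototype_loopfilter_block: given a
 * symbol name, it expands to that symbol's full prototype. */
#define prototype_edge_filter(sym) \
  void sym(uint8_t *y, int stride, int level)

/* Declaring several same-signature entry points stays a one-liner each. */
prototype_edge_filter(edge_filter_c);
prototype_edge_filter(edge_filter_strong);

/* Definitions can reuse the macro too, so signatures cannot drift apart. */
prototype_edge_filter(edge_filter_c) {
  /* average the first pixel of two neighbouring rows, plus a bias */
  y[0] = (uint8_t)((y[0] + y[stride] + level) / 2);
}

prototype_edge_filter(edge_filter_strong) {
  y[0] = (uint8_t)((y[0] + 3 * y[stride] + level) / 4);
}

int main(void) {
  uint8_t buf[8] = {10, 0, 0, 0, 30, 0, 0, 0};
  edge_filter_c(buf, 4, 2);       /* (10 + 30 + 2) / 2 = 21 */
  edge_filter_strong(buf, 4, 2);  /* (21 + 90 + 2) / 4 = 28 */
  printf("%d\n", buf[0]);
  return 0;
}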
vp9/vp9_common.mk

@@ -68,7 +68,6 @@ VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.h

VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c

@@ -76,7 +75,6 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm