SSSE3 version of fast quantizer

(test clip: tulip) For good quality mode with speed=1, this gave the encoder a small (2 - 3%) performance boost. Change-Id: I8a1d4269465944ac0819986c2f0be4b0a2ee0b35
2010-11-01 16:24:15 -04:00 · 2010-11-01 16:24:15 -04:00 · ff4a71f4c2
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@ -0,0 +1,114 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr
+;               short *qcoeff_ptr,short *dequant_ptr,
+;               short *round_ptr,
+;               short *quant_ptr, short *dqcoeff_ptr);
+;
+global sym(vp8_fast_quantize_b_impl_ssse3)
+sym(vp8_fast_quantize_b_impl_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rdx, arg(0)                 ;coeff_ptr
+    mov         rdi, arg(3)                 ;round_ptr
+    mov         rsi, arg(4)                 ;quant_ptr
+
+    movdqa      xmm0, [rdx]
+    movdqa      xmm4, [rdx + 16]
+
+    movdqa      xmm2, [rdi]                 ;round lo
+    movdqa      xmm3, [rdi + 16]            ;round hi
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    psraw       xmm0, 15                    ;sign of z (aka sz)
+    psraw       xmm4, 15                    ;sign of z (aka sz)
+
+    pabsw       xmm1, xmm1
+    pabsw       xmm5, xmm5
+
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm3
+
+    pmulhw      xmm1, [rsi]
+    pmulhw      xmm5, [rsi + 16]
+
+    mov         rdi, arg(1)                 ;qcoeff_ptr
+    mov         rcx, arg(2)                 ;dequant_ptr
+    mov         rsi, arg(5)                 ;dqcoeff_ptr
+
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    movdqa      [rdi], xmm1
+    movdqa      [rdi + 16], xmm5
+
+    movdqa      xmm2, [rcx]
+    movdqa      xmm3, [rcx + 16]
+
+    pxor        xmm4, xmm4
+    pmullw      xmm2, xmm1
+    pmullw      xmm3, xmm5
+
+    pcmpeqw     xmm1, xmm4                  ;non zero mask
+    pcmpeqw     xmm5, xmm4                  ;non zero mask
+    packsswb    xmm1, xmm5
+    pshufb      xmm1, [ GLOBAL(zz_shuf)]
+
+    pmovmskb    edx, xmm1
+
+;    xor         ecx, ecx
+;    mov         eax, -1
+;find_eob_loop:
+;    shr         edx, 1
+;    jc          fq_skip
+;    mov         eax, ecx
+;fq_skip:
+;    inc         ecx
+;    cmp         ecx, 16
+;    jne         find_eob_loop
+    xor         rdi, rdi
+    mov         eax, -1
+    xor         dx, ax                      ;flip the bits for bsr
+    bsr         eax, edx
+
+    movdqa      [rsi], xmm2                 ;store dqcoeff
+    movdqa      [rsi + 16], xmm3            ;store dqcoeff
+
+    sub         edi, edx                    ;check for all zeros in bit mask
+    sar         edi, 31                     ;0 or -1
+    add         eax, 1
+    and         eax, edi                    ;if the bit mask was all zero,
+                                            ;then eob = 0
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+zz_shuf:
+    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@ -179,6 +179,25 @@ void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)

 #endif

+#if HAVE_SSSE3
+int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
+                                 short *qcoeff_ptr, short *dequant_ptr,
+                                 short *round_ptr,
+                                 short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
+{
+    d->eob = vp8_fast_quantize_b_impl_ssse3(
+                    b->coeff,
+                    d->qcoeff,
+                    d->dequant,
+                    b->round,
+                    b->quant,
+                    d->dqcoeff
+               );
+}
+#endif
+
+
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
@ -333,6 +352,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
    {
        cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;
+
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;
+
    }
 #endif

--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@ -109,6 +109,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm