Bug 1487049 - Update libopus to v1.3-rc-19-g5cbd7d5f; r=jmspeex

Tags: #secure-revision Bug #: 1487049 Differential Revision: https://phabricator.services.mozilla.com/D4722 --HG-- extra : rebase_source : be8730e9af46dae04923be4497132795f58fb2dd
2018-08-30 10:29:43 -04:00 · 2018-08-30 10:29:43 -04:00 · 0647afe16d
--- a/media/libopus/README_MOZILLA
+++ b/media/libopus/README_MOZILLA
@ -8,4 +8,4 @@ files after the copy step.

 The upstream repository is https://git.xiph.org/opus.git

-The git tag/revision used was v1.2.1.
+The git tag/revision used was v1.3-rc-19-g5cbd7d5f.
--- a/media/libopus/celt/arch.h
+++ b/media/libopus/celt/arch.h
@ -56,23 +56,40 @@

 #define CELT_SIG_SCALE 32768.f

-#define celt_fatal(str) _celt_fatal(str, __FILE__, __LINE__);
-#ifdef ENABLE_ASSERTIONS
+#define CELT_FATAL(str) celt_fatal(str, __FILE__, __LINE__);
+
+#if defined(ENABLE_ASSERTIONS) || defined(ENABLE_HARDENING)
+#ifdef __GNUC__
+__attribute__((noreturn))
+#endif
+void celt_fatal(const char *str, const char *file, int line);
+
+#if defined(CELT_C) && !defined(OVERRIDE_celt_fatal)
 #include <stdio.h>
 #include <stdlib.h>
 #ifdef __GNUC__
 __attribute__((noreturn))
 #endif
-static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line)
+void celt_fatal(const char *str, const char *file, int line)
 {
   fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str);
   abort();
 }
-#define celt_assert(cond) {if (!(cond)) {celt_fatal("assertion failed: " #cond);}}
-#define celt_assert2(cond, message) {if (!(cond)) {celt_fatal("assertion failed: " #cond "\n" message);}}
+#endif
+
+#define celt_assert(cond) {if (!(cond)) {CELT_FATAL("assertion failed: " #cond);}}
+#define celt_assert2(cond, message) {if (!(cond)) {CELT_FATAL("assertion failed: " #cond "\n" message);}}
+#define MUST_SUCCEED(call) celt_assert((call) == OPUS_OK)
 #else
 #define celt_assert(cond)
 #define celt_assert2(cond, message)
+#define MUST_SUCCEED(call) do {if((call) != OPUS_OK) {RESTORE_STACK; return OPUS_INTERNAL_ERROR;} } while (0)
+#endif
+
+#if defined(ENABLE_ASSERTIONS)
+#define celt_sig_assert(cond) {if (!(cond)) {CELT_FATAL("signal assertion failed: " #cond);}}
+#else
+#define celt_sig_assert(cond)
 #endif

 #define IMUL32(a,b) ((a)*(b))
@ -107,6 +124,8 @@ typedef opus_val32 celt_sig;
 typedef opus_val16 celt_norm;
 typedef opus_val32 celt_ener;

+#define celt_isnan(x) 0
+
 #define Q15ONE 32767

 #define SIG_SHIFT 12
--- a/media/libopus/celt/arm/celt_fft_ne10.c
+++ b/media/libopus/celt/arm/celt_fft_ne10.c
@ -0,0 +1,173 @@
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_fft_ne10.c
+   @brief ARM Neon optimizations for fft using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include <NE10_dsp.h>
+#include "os_support.h"
+#include "kiss_fft.h"
+#include "stack_alloc.h"
+
+#if !defined(FIXED_POINT)
+# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
+#else
+# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
+#endif
+
+#if defined(CUSTOM_MODES)
+
+/* nfft lengths in NE10 that support scaled fft */
+# define NE10_FFTSCALED_SUPPORT_MAX 4
+static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
+   480, 240, 120, 60
+};
+
+int opus_fft_alloc_arm_neon(kiss_fft_state *st)
+{
+   int i;
+   size_t memneeded = sizeof(struct arch_fft_state);
+
+   st->arch_fft = (arch_fft_state *)opus_alloc(memneeded);
+   if (!st->arch_fft)
+      return -1;
+
+   for (i = 0; i < NE10_FFTSCALED_SUPPORT_MAX; i++) {
+      if(st->nfft == ne10_fft_scaled_support[i])
+         break;
+   }
+   if (i == NE10_FFTSCALED_SUPPORT_MAX) {
+      /* This nfft length (scaled fft) is not supported in NE10 */
+      st->arch_fft->is_supported = 0;
+      st->arch_fft->priv = NULL;
+   }
+   else {
+      st->arch_fft->is_supported = 1;
+      st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);
+      if (st->arch_fft->priv == NULL) {
+         return -1;
+      }
+   }
+   return 0;
+}
+
+void opus_fft_free_arm_neon(kiss_fft_state *st)
+{
+   NE10_FFT_CFG_TYPE_T cfg;
+
+   if (!st->arch_fft)
+      return;
+
+   cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;
+   if (cfg)
+      NE10_FFT_DESTROY_C2C_TYPE(cfg);
+   opus_free(st->arch_fft);
+}
+#endif
+
+void opus_fft_neon(const kiss_fft_state *st,
+                   const kiss_fft_cpx *fin,
+                   kiss_fft_cpx *fout)
+{
+   NE10_FFT_STATE_TYPE_T state;
+   NE10_FFT_CFG_TYPE_T cfg = &state;
+   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
+   SAVE_STACK;
+   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
+
+   if (!st->arch_fft->is_supported) {
+      /* This nfft length (scaled fft) not supported in NE10 */
+      opus_fft_c(st, fin, fout);
+   }
+   else {
+      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
+      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
+#if !defined(FIXED_POINT)
+      state.is_forward_scaled = 1;
+
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 0);
+#else
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 0, 1);
+#endif
+   }
+   RESTORE_STACK;
+}
+
+void opus_ifft_neon(const kiss_fft_state *st,
+                    const kiss_fft_cpx *fin,
+                    kiss_fft_cpx *fout)
+{
+   NE10_FFT_STATE_TYPE_T state;
+   NE10_FFT_CFG_TYPE_T cfg = &state;
+   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
+   SAVE_STACK;
+   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
+
+   if (!st->arch_fft->is_supported) {
+      /* This nfft length (scaled fft) not supported in NE10 */
+      opus_ifft_c(st, fin, fout);
+   }
+   else {
+      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
+      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
+#if !defined(FIXED_POINT)
+      state.is_backward_scaled = 0;
+
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 1);
+#else
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 1, 0);
+#endif
+   }
+   RESTORE_STACK;
+}
--- a/media/libopus/celt/arm/celt_mdct_ne10.c
+++ b/media/libopus/celt/arm/celt_mdct_ne10.c
@ -0,0 +1,258 @@
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_mdct_ne10.c
+   @brief ARM Neon optimizations for mdct using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include "kiss_fft.h"
+#include "_kiss_fft_guts.h"
+#include "mdct.h"
+#include "stack_alloc.h"
+
+void clt_mdct_forward_neon(const mdct_lookup *l,
+                           kiss_fft_scalar *in,
+                           kiss_fft_scalar * OPUS_RESTRICT out,
+                           const opus_val16 *window,
+                           int overlap, int shift, int stride, int arch)
+{
+   int i;
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+
+   SAVE_STACK;
+
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   ALLOC(f, N2, kiss_fft_scalar);
+   ALLOC(f2, N4, kiss_fft_cpx);
+
+   /* Consider the input to be composed of four blocks: [a, b, c, d] */
+   /* Window, shuffle, fold */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      for(i=0;i<((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
+         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
+         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+      wp1 = window;
+      wp2 = window+overlap-1;
+      for(;i<N4-((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ = *xp2;
+         *yp++ = *xp1;
+         xp1+=2;
+         xp2-=2;
+      }
+      for(;i<N4;i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
+         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+   }
+   /* Pre-rotation */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar *t = &trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
+         kiss_fft_scalar re, im, yr, yi;
+         t0 = t[i];
+         t1 = t[N4+i];
+         re = *yp++;
+         im = *yp++;
+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
+         yc.r = yr;
+         yc.i = yi;
+         f2[i] = yc;
+      }
+   }
+
+   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
+
+   /* Post-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
+         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
+         *yp1 = yr;
+         *yp2 = yi;
+         fp++;
+         yp1 += 2*stride;
+         yp2 -= 2*stride;
+      }
+   }
+   RESTORE_STACK;
+}
+
+void clt_mdct_backward_neon(const mdct_lookup *l,
+                            kiss_fft_scalar *in,
+                            kiss_fft_scalar * OPUS_RESTRICT out,
+                            const opus_val16 * OPUS_RESTRICT window,
+                            int overlap, int shift, int stride, int arch)
+{
+   int i;
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   const kiss_twiddle_scalar *trig;
+   const kiss_fft_state *st = l->kfft[shift];
+
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   ALLOC(f, N2, kiss_fft_scalar);
+
+   /* Pre-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
+         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
+         yp[2*i] = yr;
+         yp[2*i+1] = yi;
+         xp1+=2*stride;
+         xp2-=2*stride;
+      }
+   }
+
+   opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
+
+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
+      it in-place. */
+   {
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
+         middle pair will be computed twice. */
+      for(i=0;i<(N4+1)>>1;i++)
+      {
+         kiss_fft_scalar re, im, yr, yi;
+         kiss_twiddle_scalar t0, t1;
+         re = yp0[0];
+         im = yp0[1];
+         t0 = t[i];
+         t1 = t[N4+i];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         re = yp1[0];
+         im = yp1[1];
+         yp0[0] = yr;
+         yp1[1] = yi;
+
+         t0 = t[(N4-i-1)];
+         t1 = t[(N2-i-1)];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         yp1[0] = yr;
+         yp0[1] = yi;
+         yp0 += 2;
+         yp1 -= 2;
+      }
+   }
+
+   /* Mirror on both sides for TDAC */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      const opus_val16 * OPUS_RESTRICT wp1 = window;
+      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
+
+      for(i = 0; i < overlap/2; i++)
+      {
+         kiss_fft_scalar x1, x2;
+         x1 = *xp1;
+         x2 = *yp1;
+         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
+         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
+         wp1++;
+         wp2--;
+      }
+   }
+   RESTORE_STACK;
+}
--- a/media/libopus/celt/arm/celt_neon_intr.c
+++ b/media/libopus/celt/arm/celt_neon_intr.c
@ -196,7 +196,7 @@ void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
   int i;
   (void)arch;
   celt_assert(max_pitch > 0);
-   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);

   for (i = 0; i < (max_pitch-3); i += 4) {
      xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
--- a/media/libopus/celt/arm/pitch_arm.h
+++ b/media/libopus/celt/arm/pitch_arm.h
@ -90,7 +90,9 @@ extern opus_val32
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
      const opus_val16 *, opus_val32 *, int, int, int);
 #   define OVERRIDE_PITCH_XCORR (1)
-#   define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])
+#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch, arch))

 #  elif defined(OPUS_ARM_PRESUME_EDSP) || \
    defined(OPUS_ARM_PRESUME_MEDIA) || \
@ -142,7 +144,9 @@ extern void
      const opus_val16 *, opus_val32 *, int, int, int);

 #  define OVERRIDE_PITCH_XCORR (1)
-#  define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch, arch))

 #  elif defined(OPUS_ARM_PRESUME_NEON_INTR)

--- a/media/libopus/celt/bands.c
+++ b/media/libopus/celt/bands.c
@ -70,10 +70,10 @@ opus_int16 bitexact_cos(opus_int16 x)
   opus_int32 tmp;
   opus_int16 x2;
   tmp = (4096+((opus_int32)(x)*(x)))>>13;
-   celt_assert(tmp<=32767);
+   celt_sig_assert(tmp<=32767);
   x2 = tmp;
   x2 = (32767-x2) + FRAC_MUL16(x2, (-7651 + FRAC_MUL16(x2, (8277 + FRAC_MUL16(-626, x2)))));
-   celt_assert(x2<=32766);
+   celt_sig_assert(x2<=32766);
   return 1+x2;
 }

@ -282,7 +282,7 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas

      N0 = m->eBands[i+1]-m->eBands[i];
      /* depth in 1/8 bits */
-      celt_assert(pulses[i]>=0);
+      celt_sig_assert(pulses[i]>=0);
      depth = celt_udiv(1+pulses[i], (m->eBands[i+1]-m->eBands[i]))>>LM;

 #ifdef FIXED_POINT
@ -478,7 +478,7 @@ static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT
 /* Decide whether we should spread the pulses in the current frame */
 int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
      int last_decision, int *hf_average, int *tapset_decision, int update_hf,
-      int end, int C, int M)
+      int end, int C, int M, const int *spread_weight)
 {
   int i, c, N0;
   int sum = 0, nbBands=0;
@ -519,8 +519,8 @@ int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
         if (i>m->nbEBands-4)
            hf_sum += celt_udiv(32*(tcount[1]+tcount[0]), N);
         tmp = (2*tcount[2] >= N) + (2*tcount[1] >= N) + (2*tcount[0] >= N);
-         sum += tmp*256;
-         nbBands++;
+         sum += tmp*spread_weight[i];
+         nbBands+=spread_weight[i];
      }
   } while (++c<C);

@ -544,7 +544,7 @@ int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
   /*printf("%d %d %d\n", hf_sum, *hf_average, *tapset_decision);*/
   celt_assert(nbBands>0); /* end has to be non-zero */
   celt_assert(sum>=0);
-   sum = celt_udiv(sum, nbBands);
+   sum = celt_udiv((opus_int32)sum<<8, nbBands);
   /* Recursive averaging */
   sum = (sum+*average)>>1;
   *average = sum;
@ -1492,6 +1492,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
      else
         Y = NULL;
      N = M*eBands[i+1]-M*eBands[i];
+      celt_assert(N > 0);
      tell = ec_tell_frac(ec);

      /* Compute how many bits we want to allocate to this band */
@ -1507,7 +1508,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
         b = 0;
      }

-#ifdef ENABLE_UPDATE_DRAFT
+#ifndef DISABLE_UPDATE_DRAFT
      if (resynth && (M*eBands[i]-N >= M*eBands[start] || i==start+1) && (update_lowband || lowband_offset==0))
            lowband_offset = i;
      if (i == start+1)
@ -1541,7 +1542,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
         fold_start = lowband_offset;
         while(M*eBands[--fold_start] > effective_lowband+norm_offset);
         fold_end = lowband_offset-1;
-#ifdef ENABLE_UPDATE_DRAFT
+#ifndef DISABLE_UPDATE_DRAFT
         while(++fold_end < i && M*eBands[fold_end] < effective_lowband+norm_offset+N);
 #else
         while(M*eBands[++fold_end] < effective_lowband+norm_offset+N);
@ -1621,8 +1622,10 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
               ctx = ctx_save;
               OPUS_COPY(X, X_save, N);
               OPUS_COPY(Y, Y_save, N);
+#ifndef DISABLE_UPDATE_DRAFT
               if (i == start+1)
                  special_hybrid_folding(m, norm, norm2, start, M, dual_stereo);
+#endif
               /* Encode and round up. */
               ctx.theta_round = 1;
               x_cm = quant_band_stereo(&ctx, X, Y, N, b, B,
--- a/media/libopus/celt/bands.h
+++ b/media/libopus/celt/bands.h
@ -72,7 +72,7 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,

 int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
      int last_decision, int *hf_average, int *tapset_decision, int update_hf,
-      int end, int C, int M);
+      int end, int C, int M, const int *spread_weight);

 #ifdef MEASURE_NORM_MSE
 void measure_norm_mse(const CELTMode *m, float *X, float *X0, float *bandE, float *bandE0, int M, int N, int C);
--- a/media/libopus/celt/celt.h
+++ b/media/libopus/celt/celt.h
@ -59,9 +59,11 @@ typedef struct {
   float noisiness;
   float activity;
   float music_prob;
-   float vad_prob;
+   float music_prob_min;
+   float music_prob_max;
   int   bandwidth;
   float activity_probability;
+   float max_pitch_ratio;
   /* Store as Q6 char to save space. */
   unsigned char leak_boost[LEAK_BANDS];
 } AnalysisInfo;
@ -207,6 +209,13 @@ static OPUS_INLINE int fromOpus(unsigned char c)

 extern const signed char tf_select_table[4][8];

+#if defined(ENABLE_HARDENING) || defined(ENABLE_ASSERTIONS)
+void validate_celt_decoder(CELTDecoder *st);
+#define VALIDATE_CELT_DECODER(st) validate_celt_decoder(st)
+#else
+#define VALIDATE_CELT_DECODER(st)
+#endif
+
 int resampling_factor(opus_int32 rate);

 void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
--- a/media/libopus/celt/celt_decoder.c
+++ b/media/libopus/celt/celt_decoder.c
@ -51,6 +51,14 @@
 #include "celt_lpc.h"
 #include "vq.h"

+/* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save
+   CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The
+   current value corresponds to a pitch of 66.67 Hz. */
+#define PLC_PITCH_LAG_MAX (720)
+/* The minimum pitch lag to allow in the pitch-based PLC. This corresponds to a
+   pitch of 480 Hz. */
+#define PLC_PITCH_LAG_MIN (100)
+
 #if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT)
 #define NORM_ALIASING_HACK
 #endif
@ -101,6 +109,38 @@ struct OpusCustomDecoder {
   /* opus_val16 backgroundLogE[], Size = 2*mode->nbEBands */
 };

+#if defined(ENABLE_HARDENING) || defined(ENABLE_ASSERTIONS)
+/* Make basic checks on the CELT state to ensure we don't end
+   up writing all over memory. */
+void validate_celt_decoder(CELTDecoder *st)
+{
+#ifndef CUSTOM_MODES
+   celt_assert(st->mode == opus_custom_mode_create(48000, 960, NULL));
+   celt_assert(st->overlap == 120);
+#endif
+   celt_assert(st->channels == 1 || st->channels == 2);
+   celt_assert(st->stream_channels == 1 || st->stream_channels == 2);
+   celt_assert(st->downsample > 0);
+   celt_assert(st->start == 0 || st->start == 17);
+   celt_assert(st->start < st->end);
+   celt_assert(st->end <= 21);
+#ifdef OPUS_ARCHMASK
+   celt_assert(st->arch >= 0);
+   celt_assert(st->arch <= OPUS_ARCHMASK);
+#endif
+   celt_assert(st->last_pitch_index <= PLC_PITCH_LAG_MAX);
+   celt_assert(st->last_pitch_index >= PLC_PITCH_LAG_MIN || st->last_pitch_index == 0);
+   celt_assert(st->postfilter_period < MAX_PERIOD);
+   celt_assert(st->postfilter_period >= COMBFILTER_MINPERIOD || st->postfilter_period == 0);
+   celt_assert(st->postfilter_period_old < MAX_PERIOD);
+   celt_assert(st->postfilter_period_old >= COMBFILTER_MINPERIOD || st->postfilter_period_old == 0);
+   celt_assert(st->postfilter_tapset <= 2);
+   celt_assert(st->postfilter_tapset >= 0);
+   celt_assert(st->postfilter_tapset_old <= 2);
+   celt_assert(st->postfilter_tapset_old >= 0);
+}
+#endif
+
 int celt_decoder_get_size(int channels)
 {
   const CELTMode *mode = opus_custom_mode_create(48000, 960, NULL);
@ -164,7 +204,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_init(CELTDecoder *st, const CELTMod
   st->start = 0;
   st->end = st->mode->effEBands;
   st->signalling = 1;
-#ifdef ENABLE_UPDATE_DRAFT
+#ifndef DISABLE_UPDATE_DRAFT
   st->disable_inv = channels == 1;
 #else
   st->disable_inv = 0;
@ -437,14 +477,6 @@ static void tf_decode(int start, int end, int isTransient, int *tf_res, int LM,
   }
 }

-/* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save
-   CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The
-   current value corresponds to a pitch of 66.67 Hz. */
-#define PLC_PITCH_LAG_MAX (720)
-/* The minimum pitch lag to allow in the pitch-based PLC. This corresponds to a
-   pitch of 480 Hz. */
-#define PLC_PITCH_LAG_MIN (100)
-
 static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch)
 {
   int pitch_index;
@ -554,6 +586,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)

      celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
   } else {
+      int exc_length;
      /* Pitch-based PLC */
      const opus_val16 *window;
      opus_val16 *exc;
@ -561,6 +594,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
      int pitch_index;
      VARDECL(opus_val32, etmp);
      VARDECL(opus_val16, _exc);
+      VARDECL(opus_val16, fir_tmp);

      if (loss_count == 0)
      {
@ -570,8 +604,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
         fade = QCONST16(.8f,15);
      }

+      /* We want the excitation for 2 pitch periods in order to look for a
+         decaying signal, but we can't get more than MAX_PERIOD. */
+      exc_length = IMIN(2*pitch_index, MAX_PERIOD);
+
      ALLOC(etmp, overlap, opus_val32);
      ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16);
+      ALLOC(fir_tmp, exc_length, opus_val16);
      exc = _exc+LPC_ORDER;
      window = mode->window;
      c=0; do {
@ -581,13 +620,11 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
         celt_sig *buf;
         int extrapolation_offset;
         int extrapolation_len;
-         int exc_length;
         int j;

         buf = decode_mem[c];
-         for (i=0;i<MAX_PERIOD;i++) {
-            exc[i] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD+i], SIG_SHIFT);
-         }
+         for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
+            exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);

         if (loss_count == 0)
         {
@ -631,20 +668,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
         }
 #endif
         }
-         /* We want the excitation for 2 pitch periods in order to look for a
-            decaying signal, but we can't get more than MAX_PERIOD. */
-         exc_length = IMIN(2*pitch_index, MAX_PERIOD);
         /* Initialize the LPC history with the samples just before the start
            of the region for which we're computing the excitation. */
         {
-            for (i=0;i<LPC_ORDER;i++)
-            {
-               exc[MAX_PERIOD-exc_length-LPC_ORDER+i] =
-                     ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-LPC_ORDER+i], SIG_SHIFT);
-            }
-            /* Compute the excitation for exc_length samples before the loss. */
+            /* Compute the excitation for exc_length samples before the loss. We need the copy
+               because celt_fir() cannot filter in-place. */
            celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
-                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, st->arch);
+                  fir_tmp, exc_length, LPC_ORDER, st->arch);
+            OPUS_COPY(exc+MAX_PERIOD-exc_length, fir_tmp, exc_length);
         }

         /* Check if the waveform is decaying, and if so how fast.
@ -833,6 +864,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
   const opus_int16 *eBands;
   ALLOC_STACK;

+   VALIDATE_CELT_DECODER(st);
   mode = st->mode;
   nbEBands = mode->nbEBands;
   overlap = mode->overlap;
--- a/media/libopus/celt/celt_encoder.c
+++ b/media/libopus/celt/celt_encoder.c
@ -362,6 +362,12 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
      /* Compute harmonic mean discarding the unreliable boundaries
         The data is smooth, so we only take 1/4th of the samples */
      unmask=0;
+      /* We should never see NaNs here. If we find any, then something really bad happened and we better abort
+         before it does any damage later on. If these asserts are disabled (no hardening), then the table
+         lookup a few lines below (id = ...) is likely to crash dur to an out-of-bounds read. DO NOT FIX
+         that crash on NaN since it could result in a worse issue later on. */
+      celt_assert(!celt_isnan(tmp[0]));
+      celt_assert(!celt_isnan(norm));
      for (i=12;i<len2-5;i+=4)
      {
         int id;
@ -577,7 +583,7 @@ static opus_val32 l1_metric(const celt_norm *tmp, int N, int LM, opus_val16 bias

 static int tf_analysis(const CELTMode *m, int len, int isTransient,
      int *tf_res, int lambda, celt_norm *X, int N0, int LM,
-      opus_val16 tf_estimate, int tf_chan)
+      opus_val16 tf_estimate, int tf_chan, int *importance)
 {
   int i;
   VARDECL(int, metric);
@ -660,22 +666,22 @@ static int tf_analysis(const CELTMode *m, int len, int isTransient,
         biasing the decision */
      if (narrow && (metric[i]==0 || metric[i]==-2*LM))
         metric[i]-=1;
-      /*printf("%d ", metric[i]);*/
+      /*printf("%d ", metric[i]/2 + (!isTransient)*LM);*/
   }
   /*printf("\n");*/
   /* Search for the optimal tf resolution, including tf_select */
   tf_select = 0;
   for (sel=0;sel<2;sel++)
   {
-      cost0 = 0;
-      cost1 = isTransient ? 0 : lambda;
+      cost0 = importance[0]*abs(metric[0]-2*tf_select_table[LM][4*isTransient+2*sel+0]);
+      cost1 = importance[0]*abs(metric[0]-2*tf_select_table[LM][4*isTransient+2*sel+1]) + (isTransient ? 0 : lambda);
      for (i=1;i<len;i++)
      {
         int curr0, curr1;
         curr0 = IMIN(cost0, cost1 + lambda);
         curr1 = IMIN(cost0 + lambda, cost1);
-         cost0 = curr0 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+0]);
-         cost1 = curr1 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+1]);
+         cost0 = curr0 + importance[i]*abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+0]);
+         cost1 = curr1 + importance[i]*abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+1]);
      }
      cost0 = IMIN(cost0, cost1);
      selcost[sel]=cost0;
@ -684,8 +690,8 @@ static int tf_analysis(const CELTMode *m, int len, int isTransient,
    * If tests confirm it's useful for non-transients, we could allow it. */
   if (selcost[1]<selcost[0] && isTransient)
      tf_select=1;
-   cost0 = 0;
-   cost1 = isTransient ? 0 : lambda;
+   cost0 = importance[0]*abs(metric[0]-2*tf_select_table[LM][4*isTransient+2*tf_select+0]);
+   cost1 = importance[0]*abs(metric[0]-2*tf_select_table[LM][4*isTransient+2*tf_select+1]) + (isTransient ? 0 : lambda);
   /* Viterbi forward pass */
   for (i=1;i<len;i++)
   {
@ -713,8 +719,8 @@ static int tf_analysis(const CELTMode *m, int len, int isTransient,
         curr1 = from1;
         path1[i]= 1;
      }
-      cost0 = curr0 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+0]);
-      cost1 = curr1 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+1]);
+      cost0 = curr0 + importance[i]*abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+0]);
+      cost1 = curr1 + importance[i]*abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+1]);
   }
   tf_res[len-1] = cost0 < cost1 ? 0 : 1;
   /* Viterbi backward pass to check the decisions */
@ -964,7 +970,8 @@ static opus_val16 median_of_3(const opus_val16 *x)
 static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2,
      int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN,
      int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM,
-      int effectiveBytes, opus_int32 *tot_boost_, int lfe, opus_val16 *surround_dynalloc, AnalysisInfo *analysis)
+      int effectiveBytes, opus_int32 *tot_boost_, int lfe, opus_val16 *surround_dynalloc,
+      AnalysisInfo *analysis, int *importance, int *spread_weight)
 {
   int i, c;
   opus_int32 tot_boost=0;
@ -990,6 +997,42 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
      for (i=0;i<end;i++)
         maxDepth = MAX16(maxDepth, bandLogE[c*nbEBands+i]-noise_floor[i]);
   } while (++c<C);
+   {
+      /* Compute a really simple masking model to avoid taking into account completely masked
+         bands when computing the spreading decision. */
+      VARDECL(opus_val16, mask);
+      VARDECL(opus_val16, sig);
+      ALLOC(mask, nbEBands, opus_val16);
+      ALLOC(sig, nbEBands, opus_val16);
+      for (i=0;i<end;i++)
+         mask[i] = bandLogE[i]-noise_floor[i];
+      if (C==2)
+      {
+         for (i=0;i<end;i++)
+            mask[i] = MAX16(mask[i], bandLogE[nbEBands+i]-noise_floor[i]);
+      }
+      OPUS_COPY(sig, mask, end);
+      for (i=1;i<end;i++)
+         mask[i] = MAX16(mask[i], mask[i-1] - QCONST16(2.f, DB_SHIFT));
+      for (i=end-2;i>=0;i--)
+         mask[i] = MAX16(mask[i], mask[i+1] - QCONST16(3.f, DB_SHIFT));
+      for (i=0;i<end;i++)
+      {
+         /* Compute SMR: Mask is never more than 72 dB below the peak and never below the noise floor.*/
+         opus_val16 smr = sig[i]-MAX16(MAX16(0, maxDepth-QCONST16(12.f, DB_SHIFT)), mask[i]);
+         /* Clamp SMR to make sure we're not shifting by something negative or too large. */
+#ifdef FIXED_POINT
+         /* FIXME: Use PSHR16() instead */
+         int shift = -PSHR32(MAX16(-QCONST16(5.f, DB_SHIFT), MIN16(0, smr)), DB_SHIFT);
+#else
+         int shift = IMIN(5, IMAX(0, -(int)floor(.5f + smr)));
+#endif
+         spread_weight[i] = 32 >> shift;
+      }
+      /*for (i=0;i<end;i++)
+         printf("%d ", spread_weight[i]);
+      printf("\n");*/
+   }
   /* Make sure that dynamic allocation can't make us bust the budget */
   if (effectiveBytes > 50 && LM>=1 && !lfe)
   {
@ -1046,6 +1089,14 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
      }
      for (i=start;i<end;i++)
         follower[i] = MAX16(follower[i], surround_dynalloc[i]);
+      for (i=start;i<end;i++)
+      {
+#ifdef FIXED_POINT
+         importance[i] = PSHR32(13*celt_exp2(MIN16(follower[i], QCONST16(4.f, DB_SHIFT))), 16);
+#else
+         importance[i] = (int)floor(.5f+13*celt_exp2(MIN16(follower[i], QCONST16(4.f, DB_SHIFT))));
+#endif
+      }
      /* For non-transient CBR/CVBR frames, halve the dynalloc contribution */
      if ((!vbr || constrained_vbr)&&!isTransient)
      {
@ -1101,6 +1152,9 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
            tot_boost += boost_bits;
         }
      }
+   } else {
+      for (i=start;i<end;i++)
+         importance[i] = 13;
   }
   *tot_boost_ = tot_boost;
   RESTORE_STACK;
@ -1109,7 +1163,7 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16


 static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, int CC, int N,
-      int prefilter_tapset, int *pitch, opus_val16 *gain, int *qgain, int enabled, int nbAvailableBytes)
+      int prefilter_tapset, int *pitch, opus_val16 *gain, int *qgain, int enabled, int nbAvailableBytes, AnalysisInfo *analysis)
 {
   int c;
   VARDECL(celt_sig, _pre);
@ -1165,7 +1219,12 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem,
      gain1 = 0;
      pitch_index = COMBFILTER_MINPERIOD;
   }
-
+#ifndef DISABLE_FLOAT_API
+   if (analysis->valid)
+      gain1 = (opus_val16)(gain1 * analysis->max_pitch_ratio);
+#else
+   (void)analysis;
+#endif
   /* Gain threshold for enabling the prefilter/postfilter */
   pf_threshold = QCONST16(.2f,15);

@ -1362,6 +1421,8 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
   VARDECL(int, pulses);
   VARDECL(int, cap);
   VARDECL(int, offsets);
+   VARDECL(int, importance);
+   VARDECL(int, spread_weight);
   VARDECL(int, fine_priority);
   VARDECL(int, tf_res);
   VARDECL(unsigned char, collapse_masks);
@ -1414,6 +1475,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
   opus_int32 equiv_rate;
   int hybrid;
   int weak_transient = 0;
+   int enable_tf_analysis;
   VARDECL(opus_val16, surround_dynalloc);
   ALLOC_STACK;

@ -1454,7 +1516,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
      tell0_frac=tell=1;
      nbFilledBytes=0;
   } else {
-      tell0_frac=tell=ec_tell_frac(enc);
+      tell0_frac=ec_tell_frac(enc);
      tell=ec_tell(enc);
      nbFilledBytes=(tell+4)>>3;
   }
@ -1603,7 +1665,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
            && st->complexity >= 5;

      prefilter_tapset = st->tapset_decision;
-      pf_on = run_prefilter(st, in, prefilter_mem, CC, N, prefilter_tapset, &pitch_index, &gain1, &qg, enabled, nbAvailableBytes);
+      pf_on = run_prefilter(st, in, prefilter_mem, CC, N, prefilter_tapset, &pitch_index, &gain1, &qg, enabled, nbAvailableBytes, &st->analysis);
      if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && (!st->analysis.valid || st->analysis.tonality > .3)
            && (pitch_index > 1.26*st->prefilter_period || pitch_index < .79*st->prefilter_period))
         pitch_change = 1;
@ -1633,7 +1695,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
      /* Reduces the likelihood of energy instability on fricatives at low bitrate
         in hybrid mode. It seems like we still want to have real transients on vowels
         though (small SILK quantization offset value). */
-      int allow_weak_transients = hybrid && effectiveBytes<15 && st->silk_info.offset >= 100;
+      int allow_weak_transients = hybrid && effectiveBytes<15 && st->silk_info.signalType != 2;
      isTransient = transient_analysis(in, N+overlap, CC,
            &tf_estimate, &tf_chan, allow_weak_transients, &weak_transient);
   }
@ -1662,6 +1724,9 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
   }

   compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
+   /* This should catch any NaN in the CELT input. Since we're not supposed to see any (they're filtered
+      at the Opus layer), just abort. */
+   celt_assert(!celt_isnan(freq[0]) && (C==1 || !celt_isnan(freq[N])));
   if (CC==2&&C==1)
      tf_chan = 0;
   compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
@ -1805,13 +1870,23 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
   /* Band normalisation */
   normalise_bands(mode, freq, X, bandE, effEnd, C, M);

+   enable_tf_analysis = effectiveBytes>=15*C && !hybrid && st->complexity>=2 && !st->lfe;
+
+   ALLOC(offsets, nbEBands, int);
+   ALLOC(importance, nbEBands, int);
+   ALLOC(spread_weight, nbEBands, int);
+
+   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, start, end, C, offsets,
+         st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr,
+         eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc, &st->analysis, importance, spread_weight);
+
   ALLOC(tf_res, nbEBands, int);
   /* Disable variable tf resolution for hybrid and at very low bitrate */
-   if (effectiveBytes>=15*C && !hybrid && st->complexity>=2 && !st->lfe)
+   if (enable_tf_analysis)
   {
      int lambda;
-      lambda = IMAX(5, 1280/effectiveBytes + 2);
-      tf_select = tf_analysis(mode, effEnd, isTransient, tf_res, lambda, X, N, LM, tf_estimate, tf_chan);
+      lambda = IMAX(80, 20480/effectiveBytes + 2);
+      tf_select = tf_analysis(mode, effEnd, isTransient, tf_res, lambda, X, N, LM, tf_estimate, tf_chan, importance);
      for (i=effEnd;i<end;i++)
         tf_res[i] = tf_res[effEnd-1];
   } else if (hybrid && weak_transient)
@ -1822,7 +1897,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
      for (i=0;i<end;i++)
         tf_res[i] = 1;
      tf_select=0;
-   } else if (hybrid && effectiveBytes<15)
+   } else if (hybrid && effectiveBytes<15 && st->silk_info.signalType != 2)
   {
      /* For low bitrate hybrid, we force temporal resolution to 5 ms rather than 2.5 ms. */
      for (i=0;i<end;i++)
@ -1893,7 +1968,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
         {
            st->spread_decision = spreading_decision(mode, X,
                  &st->tonal_average, st->spread_decision, &st->hf_average,
-                  &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M);
+                  &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M, spread_weight);
         }
         /*printf("%d %d\n", st->tapset_decision, st->spread_decision);*/
         /*printf("%f %d %f %d\n\n", st->analysis.tonality, st->spread_decision, st->analysis.tonality_slope, st->tapset_decision);*/
@ -1901,11 +1976,6 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
      ec_enc_icdf(enc, st->spread_decision, spread_icdf, 5);
   }

-   ALLOC(offsets, nbEBands, int);
-
-   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, start, end, C, offsets,
-         st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr,
-         eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc, &st->analysis);
   /* For LFE, everything interesting is in the first band */
   if (st->lfe)
      offsets[0] = IMIN(8, effectiveBytes/3);
--- a/media/libopus/celt/celt_lpc.c
+++ b/media/libopus/celt/celt_lpc.c
@ -99,7 +99,7 @@ void celt_fir_c(
   int i,j;
   VARDECL(opus_val16, rnum);
   SAVE_STACK;
-
+   celt_assert(x != y);
   ALLOC(rnum, ord, opus_val16);
   for(i=0;i<ord;i++)
      rnum[i] = num[ord-i-1];
@ -107,7 +107,7 @@ void celt_fir_c(
   {
      opus_val32 sum[4];
      sum[0] = SHL32(EXTEND32(x[i  ]), SIG_SHIFT);
-      sum[1] = SHL32(EXTEND32(x[i+1]), SIG_SHIFT),
+      sum[1] = SHL32(EXTEND32(x[i+1]), SIG_SHIFT);
      sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT);
      sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT);
      xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
--- a/media/libopus/celt/cwrs.c
+++ b/media/libopus/celt/cwrs.c
@ -482,7 +482,7 @@ static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
      k0=_k;
      q=row[_n];
      if(q>_i){
-        celt_assert(p>q);
+        celt_sig_assert(p>q);
        _k=_n;
        do p=CELT_PVQ_U_ROW[--_k][_n];
        while(p>_i);
--- a/media/libopus/celt/entcode.h
+++ b/media/libopus/celt/entcode.h
@ -122,7 +122,7 @@ opus_uint32 ec_tell_frac(ec_ctx *_this);

 /* Tested exhaustively for all n and for 1<=d<=256 */
 static OPUS_INLINE opus_uint32 celt_udiv(opus_uint32 n, opus_uint32 d) {
-   celt_assert(d>0);
+   celt_sig_assert(d>0);
 #ifdef USE_SMALL_DIV_TABLE
   if (d>256)
      return n/d;
@ -138,7 +138,7 @@ static OPUS_INLINE opus_uint32 celt_udiv(opus_uint32 n, opus_uint32 d) {
 }

 static OPUS_INLINE opus_int32 celt_sudiv(opus_int32 n, opus_int32 d) {
-   celt_assert(d>0);
+   celt_sig_assert(d>0);
 #ifdef USE_SMALL_DIV_TABLE
   if (n<0)
      return -(opus_int32)celt_udiv(-n, d);
--- a/media/libopus/celt/entdec.h
+++ b/media/libopus/celt/entdec.h
@ -85,7 +85,7 @@ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb);
  The bits must have been encoded with ec_enc_uint().
  No call to ec_dec_update() is necessary after this call.
  _ft: The number of integers that can be decoded (one more than the max).
-       This must be at least one, and no more than 2**32-1.
+       This must be at least 2, and no more than 2**32-1.
  Return: The decoded bits.*/
 opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft);

--- a/media/libopus/celt/entenc.h
+++ b/media/libopus/celt/entenc.h
@ -67,7 +67,7 @@ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb);
 /*Encodes a raw unsigned integer in the stream.
  _fl: The integer to encode.
  _ft: The number of integers that can be encoded (one more than the max).
-       This must be at least one, and no more than 2**32-1.*/
+       This must be at least 2, and no more than 2**32-1.*/
 void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft);

 /*Encodes a sequence of raw bits in the stream.
--- a/media/libopus/celt/mathops.c
+++ b/media/libopus/celt/mathops.c
@ -38,7 +38,8 @@
 #include "mathops.h"

 /*Compute floor(sqrt(_val)) with exact arithmetic.
-  This has been tested on all possible 32-bit inputs.*/
+  _val must be greater than 0.
+  This has been tested on all possible 32-bit inputs greater than 0.*/
 unsigned isqrt32(opus_uint32 _val){
  unsigned b;
  unsigned g;
@ -182,7 +183,7 @@ opus_val32 celt_rcp(opus_val32 x)
   int i;
   opus_val16 n;
   opus_val16 r;
-   celt_assert2(x>0, "celt_rcp() only defined for positive values");
+   celt_sig_assert(x>0);
   i = celt_ilog2(x);
   /* n is Q15 with range [0,1). */
   n = VSHR32(x,i-15)-32768;
--- a/media/libopus/celt/mathops.h
+++ b/media/libopus/celt/mathops.h
@ -72,7 +72,7 @@ static OPUS_INLINE float fast_atan2f(float y, float x) {
 #undef cA
 #undef cB
 #undef cC
-#undef cD
+#undef cE
 #endif


@ -179,7 +179,7 @@ static OPUS_INLINE float celt_exp2(float x)
 /** Integer log in base2. Undefined for zero and negative numbers */
 static OPUS_INLINE opus_int16 celt_ilog2(opus_int32 x)
 {
-   celt_assert2(x>0, "celt_ilog2() only defined for strictly positive numbers");
+   celt_sig_assert(x>0);
   return EC_ILOG(x)-1;
 }
 #endif
--- a/media/libopus/celt/pitch.c
+++ b/media/libopus/celt/pitch.c
@ -102,11 +102,9 @@ static void find_best_pitch(opus_val32 *xcorr, opus_val16 *y, int len,
   }
 }

-static void celt_fir5(const opus_val16 *x,
+static void celt_fir5(opus_val16 *x,
         const opus_val16 *num,
-         opus_val16 *y,
-         int N,
-         opus_val16 *mem)
+         int N)
 {
   int i;
   opus_val16 num0, num1, num2, num3, num4;
@ -116,11 +114,11 @@ static void celt_fir5(const opus_val16 *x,
   num2=num[2];
   num3=num[3];
   num4=num[4];
-   mem0=mem[0];
-   mem1=mem[1];
-   mem2=mem[2];
-   mem3=mem[3];
-   mem4=mem[4];
+   mem0=0;
+   mem1=0;
+   mem2=0;
+   mem3=0;
+   mem4=0;
   for (i=0;i<N;i++)
   {
      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
@ -134,13 +132,8 @@ static void celt_fir5(const opus_val16 *x,
      mem2 = mem1;
      mem1 = mem0;
      mem0 = x[i];
-      y[i] = ROUND16(sum, SIG_SHIFT);
+      x[i] = ROUND16(sum, SIG_SHIFT);
   }
-   mem[0]=mem0;
-   mem[1]=mem1;
-   mem[2]=mem2;
-   mem[3]=mem3;
-   mem[4]=mem4;
 }


@ -150,7 +143,7 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
   int i;
   opus_val32 ac[5];
   opus_val16 tmp=Q15ONE;
-   opus_val16 lpc[4], mem[5]={0,0,0,0,0};
+   opus_val16 lpc[4];
   opus_val16 lpc2[5];
   opus_val16 c1 = QCONST16(.8f,15);
 #ifdef FIXED_POINT
@ -211,7 +204,7 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
   lpc2[2] = lpc[2] + MULT16_16_Q15(c1,lpc[1]);
   lpc2[3] = lpc[3] + MULT16_16_Q15(c1,lpc[2]);
   lpc2[4] = MULT16_16_Q15(c1,lpc[3]);
-   celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
+   celt_fir5(x_lp, lpc2, len>>1);
 }

 /* Pure C implementation. */
@ -256,7 +249,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
   opus_val32 maxcorr=1;
 #endif
   celt_assert(max_pitch>0);
-   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
   for (i=0;i<max_pitch-3;i+=4)
   {
      opus_val32 sum[4]={0,0,0,0};
--- a/media/libopus/celt/quant_bands.c
+++ b/media/libopus/celt/quant_bands.c
@ -457,7 +457,7 @@ void unquant_coarse_energy(const CELTMode *m, int start, int end, opus_val16 *ol
         /* It would be better to express this invariant as a
            test on C at function entry, but that isn't enough
            to make the static analyzer happy. */
-         celt_assert(c<2);
+         celt_sig_assert(c<2);
         tell = ec_tell(dec);
         if(budget-tell>=15)
         {
--- a/media/libopus/celt/vq.c
+++ b/media/libopus/celt/vq.c
@ -230,12 +230,12 @@ opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch)
         pulsesLeft -= iy[j];
      }  while (++j<N);
   }
-   celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass");
+   celt_sig_assert(pulsesLeft>=0);

   /* This should never happen, but just in case it does (e.g. on silence)
      we fill the first bin with pulses. */
 #ifdef FIXED_POINT_DEBUG
-   celt_assert2(pulsesLeft<=N+3, "Not enough pulses in the quick pass");
+   celt_sig_assert(pulsesLeft<=N+3);
 #endif
   if (pulsesLeft > N+3)
   {
--- a/media/libopus/celt/x86/celt_lpc_sse4_1.c
+++ b/media/libopus/celt/x86/celt_lpc_sse4_1.c
@ -0,0 +1,89 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+#if defined(FIXED_POINT)
+
+void celt_fir_sse4_1(const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         int arch)
+{
+    int i,j;
+    VARDECL(opus_val16, rnum);
+
+    __m128i vecNoA;
+    opus_int32 noA ;
+    SAVE_STACK;
+
+   ALLOC(rnum, ord, opus_val16);
+   for(i=0;i<ord;i++)
+      rnum[i] = num[ord-i-1];
+   noA = EXTEND32(1) << SIG_SHIFT >> 1;
+   vecNoA = _mm_set_epi32(noA, noA, noA, noA);
+
+   for (i=0;i<N-3;i+=4)
+   {
+      opus_val32 sums[4] = {0};
+      __m128i vecSum, vecX;
+
+      xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
+
+      vecSum = _mm_loadu_si128((__m128i *)sums);
+      vecSum = _mm_add_epi32(vecSum, vecNoA);
+      vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
+      vecX = OP_CVTEPI16_EPI32_M64(x + i);
+      vecSum = _mm_add_epi32(vecSum, vecX);
+      vecSum = _mm_packs_epi32(vecSum, vecSum);
+      _mm_storel_epi64((__m128i *)(y + i), vecSum);
+   }
+   for (;i<N;i++)
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<ord;j++)
+         sum = MAC16_16(sum, rnum[j], x[i+j-ord]);
+      y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
+   }
+
+   RESTORE_STACK;
+}
+
+#endif
--- a/media/libopus/celt/x86/vq_sse2.c
+++ b/media/libopus/celt/x86/vq_sse2.c
@ -135,7 +135,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
   }
   X[N] = X[N+1] = X[N+2] = -100;
   y[N] = y[N+1] = y[N+2] = 100;
-   celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass");
+   celt_sig_assert(pulsesLeft>=0);

   /* This should never happen, but just in case it does (e.g. on silence)
      we fill the first bin with pulses. */
--- a/media/libopus/celt/x86/x86cpu.h
+++ b/media/libopus/celt/x86/x86cpu.h
@ -82,7 +82,9 @@ int opus_select_arch(void);
 (_mm_cvtepi8_epi32(*(__m128i *)(x)))
 #endif

-# if !defined(__OPTIMIZE__)
+/* similar reasoning about the instruction sequence as in the 32-bit macro above,
+ */
+# if defined(__clang__) || !defined(__OPTIMIZE__)
 #  define OP_CVTEPI16_EPI32_M64(x) \
 (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
 # else
--- a/media/libopus/include/opus.h
+++ b/media/libopus/include/opus.h
@ -531,7 +531,7 @@ OPUS_EXPORT int opus_packet_parse(
   const unsigned char *frames[48],
   opus_int16 size[48],
   int *payload_offset
-) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4);
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(5);

 /** Gets the bandwidth of an Opus packet.
  * @param [in] data <tt>char*</tt>: Opus packet
--- a/media/libopus/include/opus_defines.h
+++ b/media/libopus/include/opus_defines.h
@ -169,6 +169,9 @@ extern "C" {
 #define OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST 4046
 #define OPUS_GET_PHASE_INVERSION_DISABLED_REQUEST 4047

+/** Defines for the presence of extended APIs. */
+#define OPUS_HAVE_OPUS_PROJECTION_H
+
 /* Macros to trigger compilation errors when the wrong types are provided to a CTL */
 #define __opus_check_int(x) (((void)((x) == (opus_int32)0)), (opus_int32)(x))
 #define __opus_check_int_ptr(ptr) ((ptr) + ((ptr) - (opus_int32*)(ptr)))
--- a/media/libopus/include/opus_projection.h
+++ b/media/libopus/include/opus_projection.h
@ -0,0 +1,568 @@
+/* Copyright (c) 2017 Google Inc.
+   Written by Andrew Allen */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+ * @file opus_projection.h
+ * @brief Opus projection reference API
+ */
+
+#ifndef OPUS_PROJECTION_H
+#define OPUS_PROJECTION_H
+
+#include "opus_multistream.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond OPUS_INTERNAL_DOC */
+
+/** These are the actual encoder and decoder CTL ID numbers.
+  * They should not be used directly by applications.c
+  * In general, SETs should be even and GETs should be odd.*/
+/**@{*/
+#define OPUS_PROJECTION_GET_DEMIXING_MATRIX_GAIN_REQUEST    6001
+#define OPUS_PROJECTION_GET_DEMIXING_MATRIX_SIZE_REQUEST    6003
+#define OPUS_PROJECTION_GET_DEMIXING_MATRIX_REQUEST         6005
+/**@}*/
+
+
+/** @endcond */
+
+/** @defgroup opus_projection_ctls Projection specific encoder and decoder CTLs
+  *
+  * These are convenience macros that are specific to the
+  * opus_projection_encoder_ctl() and opus_projection_decoder_ctl()
+  * interface.
+  * The CTLs from @ref opus_genericctls, @ref opus_encoderctls,
+  * @ref opus_decoderctls, and @ref opus_multistream_ctls may be applied to a
+  * projection encoder or decoder as well.
+  */
+/**@{*/
+
+/** Gets the gain (in dB. S7.8-format) of the demixing matrix from the encoder.
+  * @param[out] x <tt>opus_int32 *</tt>: Returns the gain (in dB. S7.8-format)
+  *                                      of the demixing matrix.
+  * @hideinitializer
+  */
+#define OPUS_PROJECTION_GET_DEMIXING_MATRIX_GAIN(x) OPUS_PROJECTION_GET_DEMIXING_MATRIX_GAIN_REQUEST, __opus_check_int_ptr(x)
+
+
+/** Gets the size in bytes of the demixing matrix from the encoder.
+  * @param[out] x <tt>opus_int32 *</tt>: Returns the size in bytes of the
+  *                                      demixing matrix.
+  * @hideinitializer
+  */
+#define OPUS_PROJECTION_GET_DEMIXING_MATRIX_SIZE(x) OPUS_PROJECTION_GET_DEMIXING_MATRIX_SIZE_REQUEST, __opus_check_int_ptr(x)
+
+
+/** Copies the demixing matrix to the supplied pointer location.
+  * @param[out] x <tt>unsigned char *</tt>: Returns the demixing matrix to the
+  *                                         supplied pointer location.
+  * @param y <tt>opus_int32</tt>: The size in bytes of the reserved memory at the
+  *                              pointer location.
+  * @hideinitializer
+  */
+#define OPUS_PROJECTION_GET_DEMIXING_MATRIX(x,y) OPUS_PROJECTION_GET_DEMIXING_MATRIX_REQUEST, x, __opus_check_int(y)
+
+
+/**@}*/
+
+/** Opus projection encoder state.
+ * This contains the complete state of a projection Opus encoder.
+ * It is position independent and can be freely copied.
+ * @see opus_projection_ambisonics_encoder_create
+ */
+typedef struct OpusProjectionEncoder OpusProjectionEncoder;
+
+
+/** Opus projection decoder state.
+  * This contains the complete state of a projection Opus decoder.
+  * It is position independent and can be freely copied.
+  * @see opus_projection_decoder_create
+  * @see opus_projection_decoder_init
+  */
+typedef struct OpusProjectionDecoder OpusProjectionDecoder;
+
+
+/**\name Projection encoder functions */
+/**@{*/
+
+/** Gets the size of an OpusProjectionEncoder structure.
+  * @param channels <tt>int</tt>: The total number of input channels to encode.
+  *                               This must be no more than 255.
+  * @param mapping_family <tt>int</tt>: The mapping family to use for selecting
+  *                                     the appropriate projection.
+  * @returns The size in bytes on success, or a negative error code
+  *          (see @ref opus_errorcodes) on error.
+  */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_projection_ambisonics_encoder_get_size(
+    int channels,
+    int mapping_family
+);
+
+
+/** Allocates and initializes a projection encoder state.
+  * Call opus_projection_encoder_destroy() to release
+  * this object when finished.
+  * @param Fs <tt>opus_int32</tt>: Sampling rate of the input signal (in Hz).
+  *                                This must be one of 8000, 12000, 16000,
+  *                                24000, or 48000.
+  * @param channels <tt>int</tt>: Number of channels in the input signal.
+  *                               This must be at most 255.
+  *                               It may be greater than the number of
+  *                               coded channels (<code>streams +
+  *                               coupled_streams</code>).
+  * @param mapping_family <tt>int</tt>: The mapping family to use for selecting
+  *                                     the appropriate projection.
+  * @param[out] streams <tt>int *</tt>: The total number of streams that will
+  *                                     be encoded from the input.
+  * @param[out] coupled_streams <tt>int *</tt>: Number of coupled (2 channel)
+  *                                 streams that will be encoded from the input.
+  * @param application <tt>int</tt>: The target encoder application.
+  *                                  This must be one of the following:
+  * <dl>
+  * <dt>#OPUS_APPLICATION_VOIP</dt>
+  * <dd>Process signal for improved speech intelligibility.</dd>
+  * <dt>#OPUS_APPLICATION_AUDIO</dt>
+  * <dd>Favor faithfulness to the original input.</dd>
+  * <dt>#OPUS_APPLICATION_RESTRICTED_LOWDELAY</dt>
+  * <dd>Configure the minimum possible coding delay by disabling certain modes
+  * of operation.</dd>
+  * </dl>
+  * @param[out] error <tt>int *</tt>: Returns #OPUS_OK on success, or an error
+  *                                   code (see @ref opus_errorcodes) on
+  *                                   failure.
+  */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusProjectionEncoder *opus_projection_ambisonics_encoder_create(
+    opus_int32 Fs,
+    int channels,
+    int mapping_family,
+    int *streams,
+    int *coupled_streams,
+    int application,
+    int *error
+) OPUS_ARG_NONNULL(4) OPUS_ARG_NONNULL(5);
+
+
+/** Initialize a previously allocated projection encoder state.
+  * The memory pointed to by \a st must be at least the size returned by
+  * opus_projection_ambisonics_encoder_get_size().
+  * This is intended for applications which use their own allocator instead of
+  * malloc.
+  * To reset a previously initialized state, use the #OPUS_RESET_STATE CTL.
+  * @see opus_projection_ambisonics_encoder_create
+  * @see opus_projection_ambisonics_encoder_get_size
+  * @param st <tt>OpusProjectionEncoder*</tt>: Projection encoder state to initialize.
+  * @param Fs <tt>opus_int32</tt>: Sampling rate of the input signal (in Hz).
+  *                                This must be one of 8000, 12000, 16000,
+  *                                24000, or 48000.
+  * @param channels <tt>int</tt>: Number of channels in the input signal.
+  *                               This must be at most 255.
+  *                               It may be greater than the number of
+  *                               coded channels (<code>streams +
+  *                               coupled_streams</code>).
+  * @param streams <tt>int</tt>: The total number of streams to encode from the
+  *                              input.
+  *                              This must be no more than the number of channels.
+  * @param coupled_streams <tt>int</tt>: Number of coupled (2 channel) streams
+  *                                      to encode.
+  *                                      This must be no larger than the total
+  *                                      number of streams.
+  *                                      Additionally, The total number of
+  *                                      encoded channels (<code>streams +
+  *                                      coupled_streams</code>) must be no
+  *                                      more than the number of input channels.
+  * @param application <tt>int</tt>: The target encoder application.
+  *                                  This must be one of the following:
+  * <dl>
+  * <dt>#OPUS_APPLICATION_VOIP</dt>
+  * <dd>Process signal for improved speech intelligibility.</dd>
+  * <dt>#OPUS_APPLICATION_AUDIO</dt>
+  * <dd>Favor faithfulness to the original input.</dd>
+  * <dt>#OPUS_APPLICATION_RESTRICTED_LOWDELAY</dt>
+  * <dd>Configure the minimum possible coding delay by disabling certain modes
+  * of operation.</dd>
+  * </dl>
+  * @returns #OPUS_OK on success, or an error code (see @ref opus_errorcodes)
+  *          on failure.
+  */
+OPUS_EXPORT int opus_projection_ambisonics_encoder_init(
+    OpusProjectionEncoder *st,
+    opus_int32 Fs,
+    int channels,
+    int mapping_family,
+    int *streams,
+    int *coupled_streams,
+    int application
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(5) OPUS_ARG_NONNULL(6);
+
+
+/** Encodes a projection Opus frame.
+  * @param st <tt>OpusProjectionEncoder*</tt>: Projection encoder state.
+  * @param[in] pcm <tt>const opus_int16*</tt>: The input signal as interleaved
+  *                                            samples.
+  *                                            This must contain
+  *                                            <code>frame_size*channels</code>
+  *                                            samples.
+  * @param frame_size <tt>int</tt>: Number of samples per channel in the input
+  *                                 signal.
+  *                                 This must be an Opus frame size for the
+  *                                 encoder's sampling rate.
+  *                                 For example, at 48 kHz the permitted values
+  *                                 are 120, 240, 480, 960, 1920, and 2880.
+  *                                 Passing in a duration of less than 10 ms
+  *                                 (480 samples at 48 kHz) will prevent the
+  *                                 encoder from using the LPC or hybrid modes.
+  * @param[out] data <tt>unsigned char*</tt>: Output payload.
+  *                                           This must contain storage for at
+  *                                           least \a max_data_bytes.
+  * @param [in] max_data_bytes <tt>opus_int32</tt>: Size of the allocated
+  *                                                 memory for the output
+  *                                                 payload. This may be
+  *                                                 used to impose an upper limit on
+  *                                                 the instant bitrate, but should
+  *                                                 not be used as the only bitrate
+  *                                                 control. Use #OPUS_SET_BITRATE to
+  *                                                 control the bitrate.
+  * @returns The length of the encoded packet (in bytes) on success or a
+  *          negative error code (see @ref opus_errorcodes) on failure.
+  */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_projection_encode(
+    OpusProjectionEncoder *st,
+    const opus_int16 *pcm,
+    int frame_size,
+    unsigned char *data,
+    opus_int32 max_data_bytes
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2) OPUS_ARG_NONNULL(4);
+
+
+/** Encodes a projection Opus frame from floating point input.
+  * @param st <tt>OpusProjectionEncoder*</tt>: Projection encoder state.
+  * @param[in] pcm <tt>const float*</tt>: The input signal as interleaved
+  *                                       samples with a normal range of
+  *                                       +/-1.0.
+  *                                       Samples with a range beyond +/-1.0
+  *                                       are supported but will be clipped by
+  *                                       decoders using the integer API and
+  *                                       should only be used if it is known
+  *                                       that the far end supports extended
+  *                                       dynamic range.
+  *                                       This must contain
+  *                                       <code>frame_size*channels</code>
+  *                                       samples.
+  * @param frame_size <tt>int</tt>: Number of samples per channel in the input
+  *                                 signal.
+  *                                 This must be an Opus frame size for the
+  *                                 encoder's sampling rate.
+  *                                 For example, at 48 kHz the permitted values
+  *                                 are 120, 240, 480, 960, 1920, and 2880.
+  *                                 Passing in a duration of less than 10 ms
+  *                                 (480 samples at 48 kHz) will prevent the
+  *                                 encoder from using the LPC or hybrid modes.
+  * @param[out] data <tt>unsigned char*</tt>: Output payload.
+  *                                           This must contain storage for at
+  *                                           least \a max_data_bytes.
+  * @param [in] max_data_bytes <tt>opus_int32</tt>: Size of the allocated
+  *                                                 memory for the output
+  *                                                 payload. This may be
+  *                                                 used to impose an upper limit on
+  *                                                 the instant bitrate, but should
+  *                                                 not be used as the only bitrate
+  *                                                 control. Use #OPUS_SET_BITRATE to
+  *                                                 control the bitrate.
+  * @returns The length of the encoded packet (in bytes) on success or a
+  *          negative error code (see @ref opus_errorcodes) on failure.
+  */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_projection_encode_float(
+    OpusProjectionEncoder *st,
+    const float *pcm,
+    int frame_size,
+    unsigned char *data,
+    opus_int32 max_data_bytes
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2) OPUS_ARG_NONNULL(4);
+
+
+/** Frees an <code>OpusProjectionEncoder</code> allocated by
+  * opus_projection_ambisonics_encoder_create().
+  * @param st <tt>OpusProjectionEncoder*</tt>: Projection encoder state to be freed.
+  */
+OPUS_EXPORT void opus_projection_encoder_destroy(OpusProjectionEncoder *st);
+
+
+/** Perform a CTL function on a projection Opus encoder.
+  *
+  * Generally the request and subsequent arguments are generated by a
+  * convenience macro.
+  * @param st <tt>OpusProjectionEncoder*</tt>: Projection encoder state.
+  * @param request This and all remaining parameters should be replaced by one
+  *                of the convenience macros in @ref opus_genericctls,
+  *                @ref opus_encoderctls, @ref opus_multistream_ctls, or
+  *                @ref opus_projection_ctls
+  * @see opus_genericctls
+  * @see opus_encoderctls
+  * @see opus_multistream_ctls
+  * @see opus_projection_ctls
+  */
+OPUS_EXPORT int opus_projection_encoder_ctl(OpusProjectionEncoder *st, int request, ...) OPUS_ARG_NONNULL(1);
+
+
+/**@}*/
+
+/**\name Projection decoder functions */
+/**@{*/
+
+/** Gets the size of an <code>OpusProjectionDecoder</code> structure.
+  * @param channels <tt>int</tt>: The total number of output channels.
+  *                               This must be no more than 255.
+  * @param streams <tt>int</tt>: The total number of streams coded in the
+  *                              input.
+  *                              This must be no more than 255.
+  * @param coupled_streams <tt>int</tt>: Number streams to decode as coupled
+  *                                      (2 channel) streams.
+  *                                      This must be no larger than the total
+  *                                      number of streams.
+  *                                      Additionally, The total number of
+  *                                      coded channels (<code>streams +
+  *                                      coupled_streams</code>) must be no
+  *                                      more than 255.
+  * @returns The size in bytes on success, or a negative error code
+  *          (see @ref opus_errorcodes) on error.
+  */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_projection_decoder_get_size(
+    int channels,
+    int streams,
+    int coupled_streams
+);
+
+
+/** Allocates and initializes a projection decoder state.
+  * Call opus_projection_decoder_destroy() to release
+  * this object when finished.
+  * @param Fs <tt>opus_int32</tt>: Sampling rate to decode at (in Hz).
+  *                                This must be one of 8000, 12000, 16000,
+  *                                24000, or 48000.
+  * @param channels <tt>int</tt>: Number of channels to output.
+  *                               This must be at most 255.
+  *                               It may be different from the number of coded
+  *                               channels (<code>streams +
+  *                               coupled_streams</code>).
+  * @param streams <tt>int</tt>: The total number of streams coded in the
+  *                              input.
+  *                              This must be no more than 255.
+  * @param coupled_streams <tt>int</tt>: Number of streams to decode as coupled
+  *                                      (2 channel) streams.
+  *                                      This must be no larger than the total
+  *                                      number of streams.
+  *                                      Additionally, The total number of
+  *                                      coded channels (<code>streams +
+  *                                      coupled_streams</code>) must be no
+  *                                      more than 255.
+  * @param[in] demixing_matrix <tt>const unsigned char[demixing_matrix_size]</tt>: Demixing matrix
+  *                         that mapping from coded channels to output channels,
+  *                         as described in @ref opus_projection and
+  *                         @ref opus_projection_ctls.
+  * @param demixing_matrix_size <tt>opus_int32</tt>: The size in bytes of the
+  *                                                  demixing matrix, as
+  *                                                  described in @ref
+  *                                                  opus_projection_ctls.
+  * @param[out] error <tt>int *</tt>: Returns #OPUS_OK on success, or an error
+  *                                   code (see @ref opus_errorcodes) on
+  *                                   failure.
+  */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusProjectionDecoder *opus_projection_decoder_create(
+    opus_int32 Fs,
+    int channels,
+    int streams,
+    int coupled_streams,
+    unsigned char *demixing_matrix,
+    opus_int32 demixing_matrix_size,
+    int *error
+) OPUS_ARG_NONNULL(5);
+
+
+/** Intialize a previously allocated projection decoder state object.
+  * The memory pointed to by \a st must be at least the size returned by
+  * opus_projection_decoder_get_size().
+  * This is intended for applications which use their own allocator instead of
+  * malloc.
+  * To reset a previously initialized state, use the #OPUS_RESET_STATE CTL.
+  * @see opus_projection_decoder_create
+  * @see opus_projection_deocder_get_size
+  * @param st <tt>OpusProjectionDecoder*</tt>: Projection encoder state to initialize.
+  * @param Fs <tt>opus_int32</tt>: Sampling rate to decode at (in Hz).
+  *                                This must be one of 8000, 12000, 16000,
+  *                                24000, or 48000.
+  * @param channels <tt>int</tt>: Number of channels to output.
+  *                               This must be at most 255.
+  *                               It may be different from the number of coded
+  *                               channels (<code>streams +
+  *                               coupled_streams</code>).
+  * @param streams <tt>int</tt>: The total number of streams coded in the
+  *                              input.
+  *                              This must be no more than 255.
+  * @param coupled_streams <tt>int</tt>: Number of streams to decode as coupled
+  *                                      (2 channel) streams.
+  *                                      This must be no larger than the total
+  *                                      number of streams.
+  *                                      Additionally, The total number of
+  *                                      coded channels (<code>streams +
+  *                                      coupled_streams</code>) must be no
+  *                                      more than 255.
+  * @param[in] demixing_matrix <tt>const unsigned char[demixing_matrix_size]</tt>: Demixing matrix
+  *                         that mapping from coded channels to output channels,
+  *                         as described in @ref opus_projection and
+  *                         @ref opus_projection_ctls.
+  * @param demixing_matrix_size <tt>opus_int32</tt>: The size in bytes of the
+  *                                                  demixing matrix, as
+  *                                                  described in @ref
+  *                                                  opus_projection_ctls.
+  * @returns #OPUS_OK on success, or an error code (see @ref opus_errorcodes)
+  *          on failure.
+  */
+OPUS_EXPORT int opus_projection_decoder_init(
+    OpusProjectionDecoder *st,
+    opus_int32 Fs,
+    int channels,
+    int streams,
+    int coupled_streams,
+    unsigned char *demixing_matrix,
+    opus_int32 demixing_matrix_size
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(6);
+
+
+/** Decode a projection Opus packet.
+  * @param st <tt>OpusProjectionDecoder*</tt>: Projection decoder state.
+  * @param[in] data <tt>const unsigned char*</tt>: Input payload.
+  *                                                Use a <code>NULL</code>
+  *                                                pointer to indicate packet
+  *                                                loss.
+  * @param len <tt>opus_int32</tt>: Number of bytes in payload.
+  * @param[out] pcm <tt>opus_int16*</tt>: Output signal, with interleaved
+  *                                       samples.
+  *                                       This must contain room for
+  *                                       <code>frame_size*channels</code>
+  *                                       samples.
+  * @param frame_size <tt>int</tt>: The number of samples per channel of
+  *                                 available space in \a pcm.
+  *                                 If this is less than the maximum packet duration
+  *                                 (120 ms; 5760 for 48kHz), this function will not be capable
+  *                                 of decoding some packets. In the case of PLC (data==NULL)
+  *                                 or FEC (decode_fec=1), then frame_size needs to be exactly
+  *                                 the duration of audio that is missing, otherwise the
+  *                                 decoder will not be in the optimal state to decode the
+  *                                 next incoming packet. For the PLC and FEC cases, frame_size
+  *                                 <b>must</b> be a multiple of 2.5 ms.
+  * @param decode_fec <tt>int</tt>: Flag (0 or 1) to request that any in-band
+  *                                 forward error correction data be decoded.
+  *                                 If no such data is available, the frame is
+  *                                 decoded as if it were lost.
+  * @returns Number of samples decoded on success or a negative error code
+  *          (see @ref opus_errorcodes) on failure.
+  */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_projection_decode(
+    OpusProjectionDecoder *st,
+    const unsigned char *data,
+    opus_int32 len,
+    opus_int16 *pcm,
+    int frame_size,
+    int decode_fec
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4);
+
+
+/** Decode a projection Opus packet with floating point output.
+  * @param st <tt>OpusProjectionDecoder*</tt>: Projection decoder state.
+  * @param[in] data <tt>const unsigned char*</tt>: Input payload.
+  *                                                Use a <code>NULL</code>
+  *                                                pointer to indicate packet
+  *                                                loss.
+  * @param len <tt>opus_int32</tt>: Number of bytes in payload.
+  * @param[out] pcm <tt>opus_int16*</tt>: Output signal, with interleaved
+  *                                       samples.
+  *                                       This must contain room for
+  *                                       <code>frame_size*channels</code>
+  *                                       samples.
+  * @param frame_size <tt>int</tt>: The number of samples per channel of
+  *                                 available space in \a pcm.
+  *                                 If this is less than the maximum packet duration
+  *                                 (120 ms; 5760 for 48kHz), this function will not be capable
+  *                                 of decoding some packets. In the case of PLC (data==NULL)
+  *                                 or FEC (decode_fec=1), then frame_size needs to be exactly
+  *                                 the duration of audio that is missing, otherwise the
+  *                                 decoder will not be in the optimal state to decode the
+  *                                 next incoming packet. For the PLC and FEC cases, frame_size
+  *                                 <b>must</b> be a multiple of 2.5 ms.
+  * @param decode_fec <tt>int</tt>: Flag (0 or 1) to request that any in-band
+  *                                 forward error correction data be decoded.
+  *                                 If no such data is available, the frame is
+  *                                 decoded as if it were lost.
+  * @returns Number of samples decoded on success or a negative error code
+  *          (see @ref opus_errorcodes) on failure.
+  */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_projection_decode_float(
+    OpusProjectionDecoder *st,
+    const unsigned char *data,
+    opus_int32 len,
+    float *pcm,
+    int frame_size,
+    int decode_fec
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4);
+
+
+/** Perform a CTL function on a projection Opus decoder.
+  *
+  * Generally the request and subsequent arguments are generated by a
+  * convenience macro.
+  * @param st <tt>OpusProjectionDecoder*</tt>: Projection decoder state.
+  * @param request This and all remaining parameters should be replaced by one
+  *                of the convenience macros in @ref opus_genericctls,
+  *                @ref opus_decoderctls, @ref opus_multistream_ctls, or
+  *                @ref opus_projection_ctls.
+  * @see opus_genericctls
+  * @see opus_decoderctls
+  * @see opus_multistream_ctls
+  * @see opus_projection_ctls
+  */
+OPUS_EXPORT int opus_projection_decoder_ctl(OpusProjectionDecoder *st, int request, ...) OPUS_ARG_NONNULL(1);
+
+
+/** Frees an <code>OpusProjectionDecoder</code> allocated by
+  * opus_projection_decoder_create().
+  * @param st <tt>OpusProjectionDecoder</tt>: Projection decoder state to be freed.
+  */
+OPUS_EXPORT void opus_projection_decoder_destroy(OpusProjectionDecoder *st);
+
+
+/**@}*/
+
+/**@}*/
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* OPUS_PROJECTION_H */
--- a/media/libopus/include/opus_types.h
+++ b/media/libopus/include/opus_types.h
@ -33,14 +33,29 @@
 #ifndef OPUS_TYPES_H
 #define OPUS_TYPES_H

+#define opus_int         int                     /* used for counters etc; at least 16 bits */
+#define opus_int64       long long
+#define opus_int8        signed char
+
+#define opus_uint        unsigned int            /* used for counters etc; at least 16 bits */
+#define opus_uint64      unsigned long long
+#define opus_uint8       unsigned char
+
 /* Use the real stdint.h if it's there (taken from Paul Hsieh's pstdint.h) */
 #if (defined(__STDC__) && __STDC__ && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_)) || defined (HAVE_STDINT_H))
 #include <stdint.h>
-
+#  undef opus_int64
+#  undef opus_int8
+#  undef opus_uint64
+#  undef opus_uint8
+   typedef int8_t opus_int8;
+   typedef uint8_t opus_uint8;
   typedef int16_t opus_int16;
   typedef uint16_t opus_uint16;
   typedef int32_t opus_int32;
   typedef uint32_t opus_uint32;
+   typedef int64_t opus_int64;
+   typedef uint64_t opus_uint64;
 #elif defined(_WIN32)

 #  if defined(__CYGWIN__)
@ -148,12 +163,4 @@

 #endif

-#define opus_int         int                     /* used for counters etc; at least 16 bits */
-#define opus_int64       long long
-#define opus_int8        signed char
-
-#define opus_uint        unsigned int            /* used for counters etc; at least 16 bits */
-#define opus_uint64      unsigned long long
-#define opus_uint8       unsigned char
-
 #endif  /* OPUS_TYPES_H */
--- a/media/libopus/moz.build
+++ b/media/libopus/moz.build
@ -20,7 +20,7 @@ AllowCompilerWarnings()
 FINAL_LIBRARY = 'gkmedias'

 DEFINES['OPUS_BUILD'] = True
-DEFINES['OPUS_VERSION'] = '"v1.2.1-mozilla"'
+DEFINES['OPUS_VERSION'] = '"v1.3-rc-19-g5cbd7d5f-mozilla"'
 DEFINES['USE_ALLOCA'] = True

 # Don't export symbols
--- a/media/libopus/silk/API.h
+++ b/media/libopus/silk/API.h
@ -80,7 +80,8 @@ opus_int silk_Encode(                                   /* O    Returns error co
    opus_int                        nSamplesIn,         /* I    Number of samples in input vector               */
    ec_enc                          *psRangeEnc,        /* I/O  Compressor data structure                       */
    opus_int32                      *nBytesOut,         /* I/O  Number of bytes in payload (input: Max bytes)   */
-    const opus_int                  prefillFlag         /* I    Flag to indicate prefilling buffers no coding   */
+    const opus_int                  prefillFlag,        /* I    Flag to indicate prefilling buffers no coding   */
+    int                             activity            /* I    Decision of Opus voice activity detector        */
 );

 /****************************************/
--- a/media/libopus/silk/CNG.c
+++ b/media/libopus/silk/CNG.c
@ -146,8 +146,8 @@ void silk_CNG(

        /* Generate CNG signal, by synthesis filtering */
        silk_memcpy( CNG_sig_Q14, psCNG->CNG_synth_state, MAX_LPC_ORDER * sizeof( opus_int32 ) );
+        celt_assert( psDec->LPC_order == 10 || psDec->LPC_order == 16 );
        for( i = 0; i < length; i++ ) {
-            silk_assert( psDec->LPC_order == 10 || psDec->LPC_order == 16 );
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LPC_pred_Q10 = silk_RSHIFT( psDec->LPC_order, 1 );
            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i -  1 ], A_Q12[ 0 ] );
--- a/media/libopus/silk/LPC_analysis_filter.c
+++ b/media/libopus/silk/LPC_analysis_filter.c
@ -64,12 +64,12 @@ void silk_LPC_analysis_filter(
    const opus_int16 *in_ptr;
 #endif

-    silk_assert( d >= 6 );
-    silk_assert( (d & 1) == 0 );
-    silk_assert( d <= len );
+    celt_assert( d >= 6 );
+    celt_assert( (d & 1) == 0 );
+    celt_assert( d <= len );

 #if defined(FIXED_POINT) && USE_CELT_FIR
-    silk_assert( d <= SILK_MAX_ORDER_LPC );
+    celt_assert( d <= SILK_MAX_ORDER_LPC );
    for ( j = 0; j < d; j++ ) {
        num[ j ] = -B[ j ];
    }
--- a/media/libopus/silk/NLSF2A.c
+++ b/media/libopus/silk/NLSF2A.c
@ -86,7 +86,7 @@ void silk_NLSF2A(
    opus_int32 a32_QA1[ SILK_MAX_ORDER_LPC ];

    silk_assert( LSF_COS_TAB_SZ_FIX == 128 );
-    silk_assert( d==10 || d==16 );
+    celt_assert( d==10 || d==16 );

    /* convert LSFs to 2*cos(LSF), using piecewise linear curve from table */
    ordering = d == 16 ? ordering16 : ordering10;
--- a/media/libopus/silk/NLSF_VQ.c
+++ b/media/libopus/silk/NLSF_VQ.c
@ -46,7 +46,7 @@ void silk_NLSF_VQ(
    const opus_int16 *w_Q9_ptr;
    const opus_uint8 *cb_Q8_ptr;

-    silk_assert( ( LPC_order & 1 ) == 0 );
+    celt_assert( ( LPC_order & 1 ) == 0 );

    /* Loop over codebook */
    cb_Q8_ptr = pCB_Q8;
--- a/media/libopus/silk/NLSF_VQ_weights_laroia.c
+++ b/media/libopus/silk/NLSF_VQ_weights_laroia.c
@ -48,8 +48,8 @@ void silk_NLSF_VQ_weights_laroia(
    opus_int   k;
    opus_int32 tmp1_int, tmp2_int;

-    silk_assert( D > 0 );
-    silk_assert( ( D & 1 ) == 0 );
+    celt_assert( D > 0 );
+    celt_assert( ( D & 1 ) == 0 );

    /* First value */
    tmp1_int = silk_max_int( pNLSF_Q15[ 0 ], 1 );
--- a/media/libopus/silk/NLSF_encode.c
+++ b/media/libopus/silk/NLSF_encode.c
@ -60,7 +60,7 @@ opus_int32 silk_NLSF_encode(                                    /* O    Returns
    const opus_int16 *pCB_Wght_Q9;
    SAVE_STACK;

-    silk_assert( signalType >= 0 && signalType <= 2 );
+    celt_assert( signalType >= 0 && signalType <= 2 );
    silk_assert( NLSF_mu_Q20 <= 32767 && NLSF_mu_Q20 >= 0 );

    /* NLSF stabilization */
--- a/media/libopus/silk/NSQ.c
+++ b/media/libopus/silk/NSQ.c
@ -143,7 +143,7 @@ void silk_NSQ_c
            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
-                silk_assert( start_idx > 0 );
+                celt_assert( start_idx > 0 );

                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
@ -247,7 +247,7 @@ void silk_noise_shape_quantizer(
        }

        /* Noise shape feedback */
-        silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
+        celt_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
        n_AR_Q12 = silk_NSQ_noise_shape_feedback_loop(&NSQ->sDiff_shp_Q14, NSQ->sAR2_Q14, AR_shp_Q13, shapingLPCOrder, arch);

        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sLF_AR_shp_Q14, Tilt_Q14 );
@ -255,7 +255,7 @@ void silk_noise_shape_quantizer(
        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, NSQ->sLF_AR_shp_Q14, LF_shp_Q14 );

-        silk_assert( lag > 0 || signalType != TYPE_VOICED );
+        celt_assert( lag > 0 || signalType != TYPE_VOICED );

        /* Combine prediction and noise shaping signals */
        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
--- a/media/libopus/silk/NSQ_del_dec.c
+++ b/media/libopus/silk/NSQ_del_dec.c
@ -250,7 +250,7 @@ void silk_NSQ_del_dec_c(

                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
-                silk_assert( start_idx > 0 );
+                celt_assert( start_idx > 0 );

                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
@ -361,7 +361,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
    NSQ_sample_struct  *psSS;
    SAVE_STACK;

-    silk_assert( nStatesDelayedDecision > 0 );
+    celt_assert( nStatesDelayedDecision > 0 );
    ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );

    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
@ -419,7 +419,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
            LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 );                              /* Q10 -> Q14 */

            /* Noise shape feedback */
-            silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
+            celt_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
            /* Output of lowpass section */
            tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
            /* Output of allpass section */
--- a/media/libopus/silk/PLC.c
+++ b/media/libopus/silk/PLC.c
@ -291,7 +291,7 @@ static OPUS_INLINE void silk_PLC_conceal(

    /* Rewhiten LTP state */
    idx = psDec->ltp_mem_length - lag - psDec->LPC_order - LTP_ORDER / 2;
-    silk_assert( idx > 0 );
+    celt_assert( idx > 0 );
    silk_LPC_analysis_filter( &sLTP[ idx ], &psDec->outBuf[ idx ], A_Q12, psDec->ltp_mem_length - idx, psDec->LPC_order, arch );
    /* Scale LTP state */
    inv_gain_Q30 = silk_INVERSE32_varQ( psPLC->prevGain_Q16[ 1 ], 46 );
@ -347,7 +347,7 @@ static OPUS_INLINE void silk_PLC_conceal(
    /* Copy LPC state */
    silk_memcpy( sLPC_Q14_ptr, psDec->sLPC_Q14_buf, MAX_LPC_ORDER * sizeof( opus_int32 ) );

-    silk_assert( psDec->LPC_order >= 10 ); /* check that unrolling works */
+    celt_assert( psDec->LPC_order >= 10 ); /* check that unrolling works */
    for( i = 0; i < psDec->frame_length; i++ ) {
        /* partly unrolled */
        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
--- a/media/libopus/silk/VAD.c
+++ b/media/libopus/silk/VAD.c
@ -101,9 +101,9 @@ opus_int silk_VAD_GetSA_Q8_c(                                   /* O    Return v

    /* Safety checks */
    silk_assert( VAD_N_BANDS == 4 );
-    silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
-    silk_assert( psEncC->frame_length <= 512 );
-    silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
+    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
+    celt_assert( psEncC->frame_length <= 512 );
+    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );

    /***********************/
    /* Filter and Decimate */
@ -252,15 +252,14 @@ opus_int silk_VAD_GetSA_Q8_c(                                   /* O    Return v
        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
    }

+    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
+        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
+    }
    /* Power scaling */
    if( speech_nrg <= 0 ) {
        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
-    } else if( speech_nrg < 32768 ) {
-        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
-        } else {
-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
-        }
+    } else if( speech_nrg < 16384 ) {
+        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );

        /* square-root */
        speech_nrg = silk_SQRT_APPROX( speech_nrg );
--- a/media/libopus/silk/arm/LPC_inv_pred_gain_neon_intr.c
+++ b/media/libopus/silk/arm/LPC_inv_pred_gain_neon_intr.c
@ -217,13 +217,13 @@ opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse predi
        {
        case 24:
            t0_s32x4 = vpadalq_s16( t0_s32x4, t2_s16x8 );
-            /* Intend to fall through */
+            /* FALLTHROUGH */

        case 16:
            t0_s32x4 = vpadalq_s16( t0_s32x4, t1_s16x8 );
            vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) );
-            /* Intend to fall through */
+            /* FALLTHROUGH */

        case 8:
        {
@ -246,17 +246,17 @@ opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse predi
        case 6:
            DC_resp += (opus_int32)A_Q12[ 5 ];
            DC_resp += (opus_int32)A_Q12[ 4 ];
-            /* Intend to fall through */
+            /* FALLTHROUGH */

        case 4:
            DC_resp += (opus_int32)A_Q12[ 3 ];
            DC_resp += (opus_int32)A_Q12[ 2 ];
-            /* Intend to fall through */
+            /* FALLTHROUGH */

        case 2:
            DC_resp += (opus_int32)A_Q12[ 1 ];
            DC_resp += (opus_int32)A_Q12[ 0 ];
-            /* Intend to fall through */
+            /* FALLTHROUGH */

        default:
            break;
--- a/media/libopus/silk/check_control_input.c
+++ b/media/libopus/silk/check_control_input.c
@ -38,7 +38,7 @@ opus_int check_control_input(
    silk_EncControlStruct        *encControl                    /* I    Control structure                           */
 )
 {
-    silk_assert( encControl != NULL );
+    celt_assert( encControl != NULL );

    if( ( ( encControl->API_sampleRate            !=  8000 ) &&
          ( encControl->API_sampleRate            != 12000 ) &&
@ -59,46 +59,46 @@ opus_int check_control_input(
          ( encControl->minInternalSampleRate > encControl->desiredInternalSampleRate ) ||
          ( encControl->maxInternalSampleRate < encControl->desiredInternalSampleRate ) ||
          ( encControl->minInternalSampleRate > encControl->maxInternalSampleRate ) ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_FS_NOT_SUPPORTED;
    }
    if( encControl->payloadSize_ms != 10 &&
        encControl->payloadSize_ms != 20 &&
        encControl->payloadSize_ms != 40 &&
        encControl->payloadSize_ms != 60 ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_PACKET_SIZE_NOT_SUPPORTED;
    }
    if( encControl->packetLossPercentage < 0 || encControl->packetLossPercentage > 100 ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_INVALID_LOSS_RATE;
    }
    if( encControl->useDTX < 0 || encControl->useDTX > 1 ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_INVALID_DTX_SETTING;
    }
    if( encControl->useCBR < 0 || encControl->useCBR > 1 ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_INVALID_CBR_SETTING;
    }
    if( encControl->useInBandFEC < 0 || encControl->useInBandFEC > 1 ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_INVALID_INBAND_FEC_SETTING;
    }
    if( encControl->nChannelsAPI < 1 || encControl->nChannelsAPI > ENCODER_NUM_CHANNELS ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_INVALID_NUMBER_OF_CHANNELS_ERROR;
    }
    if( encControl->nChannelsInternal < 1 || encControl->nChannelsInternal > ENCODER_NUM_CHANNELS ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_INVALID_NUMBER_OF_CHANNELS_ERROR;
    }
    if( encControl->nChannelsInternal > encControl->nChannelsAPI ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_INVALID_NUMBER_OF_CHANNELS_ERROR;
    }
    if( encControl->complexity < 0 || encControl->complexity > 10 ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        return SILK_ENC_INVALID_COMPLEXITY_SETTING;
    }

--- a/media/libopus/silk/control_SNR.c
+++ b/media/libopus/silk/control_SNR.c
@ -32,44 +32,82 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "main.h"
 #include "tuning_parameters.h"

+/* These tables hold SNR values divided by 21 (so they fit in 8 bits)
+   for different target bitrates spaced at 400 bps interval. The first
+   10 values are omitted (0-4 kb/s) because they're all zeros.
+   These tables were obtained by running different SNRs through the
+   encoder and measuring the active bitrate. */
+static const unsigned char silk_TargetRate_NB_21[117 - 10] = {
+                                              0, 15, 39, 52, 61, 68,
+     74, 79, 84, 88, 92, 95, 99,102,105,108,111,114,117,119,122,124,
+    126,129,131,133,135,137,139,142,143,145,147,149,151,153,155,157,
+    158,160,162,163,165,167,168,170,171,173,174,176,177,179,180,182,
+    183,185,186,187,189,190,192,193,194,196,197,199,200,201,203,204,
+    205,207,208,209,211,212,213,215,216,217,219,220,221,223,224,225,
+    227,228,230,231,232,234,235,236,238,239,241,242,243,245,246,248,
+    249,250,252,253,255
+};
+
+static const unsigned char silk_TargetRate_MB_21[165 - 10] = {
+                                              0,  0, 28, 43, 52, 59,
+     65, 70, 74, 78, 81, 85, 87, 90, 93, 95, 98,100,102,105,107,109,
+    111,113,115,116,118,120,122,123,125,127,128,130,131,133,134,136,
+    137,138,140,141,143,144,145,147,148,149,151,152,153,154,156,157,
+    158,159,160,162,163,164,165,166,167,168,169,171,172,173,174,175,
+    176,177,178,179,180,181,182,183,184,185,186,187,188,188,189,190,
+    191,192,193,194,195,196,197,198,199,200,201,202,203,203,204,205,
+    206,207,208,209,210,211,212,213,214,214,215,216,217,218,219,220,
+    221,222,223,224,224,225,226,227,228,229,230,231,232,233,234,235,
+    236,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,
+    251,252,253,254,255
+};
+
+static const unsigned char silk_TargetRate_WB_21[201 - 10] = {
+                                              0,  0,  0,  8, 29, 41,
+     49, 56, 62, 66, 70, 74, 77, 80, 83, 86, 88, 91, 93, 95, 97, 99,
+    101,103,105,107,108,110,112,113,115,116,118,119,121,122,123,125,
+    126,127,129,130,131,132,134,135,136,137,138,140,141,142,143,144,
+    145,146,147,148,149,150,151,152,153,154,156,157,158,159,159,160,
+    161,162,163,164,165,166,167,168,169,170,171,171,172,173,174,175,
+    176,177,177,178,179,180,181,181,182,183,184,185,185,186,187,188,
+    189,189,190,191,192,192,193,194,195,195,196,197,198,198,199,200,
+    200,201,202,203,203,204,205,206,206,207,208,209,209,210,211,211,
+    212,213,214,214,215,216,216,217,218,219,219,220,221,221,222,223,
+    224,224,225,226,226,227,228,229,229,230,231,232,232,233,234,234,
+    235,236,237,237,238,239,240,240,241,242,243,243,244,245,246,246,
+    247,248,249,249,250,251,252,253,255
+};
+
 /* Control SNR of redidual quantizer */
 opus_int silk_control_SNR(
    silk_encoder_state          *psEncC,                        /* I/O  Pointer to Silk encoder state               */
    opus_int32                  TargetRate_bps                  /* I    Target max bitrate (bps)                    */
 )
 {
-    opus_int k, ret = SILK_NO_ERROR;
-    opus_int32 frac_Q6;
-    const opus_int32 *rateTable;
+    int id;
+    int bound;
+    const unsigned char *snr_table;

-    /* Set bitrate/coding quality */
-    TargetRate_bps = silk_LIMIT( TargetRate_bps, MIN_TARGET_RATE_BPS, MAX_TARGET_RATE_BPS );
-    if( TargetRate_bps != psEncC->TargetRate_bps ) {
    psEncC->TargetRate_bps = TargetRate_bps;
-
-        /* If new TargetRate_bps, translate to SNR_dB value */
-        if( psEncC->fs_kHz == 8 ) {
-            rateTable = silk_TargetRate_table_NB;
-        } else if( psEncC->fs_kHz == 12 ) {
-            rateTable = silk_TargetRate_table_MB;
-        } else {
-            rateTable = silk_TargetRate_table_WB;
-        }
-
-        /* Reduce bitrate for 10 ms modes in these calculations */
    if( psEncC->nb_subfr == 2 ) {
-            TargetRate_bps -= REDUCE_BITRATE_10_MS_BPS;
+        TargetRate_bps -= 2000 + psEncC->fs_kHz/16;
    }
-
-        /* Find bitrate interval in table and interpolate */
-        for( k = 1; k < TARGET_RATE_TAB_SZ; k++ ) {
-            if( TargetRate_bps <= rateTable[ k ] ) {
-                frac_Q6 = silk_DIV32( silk_LSHIFT( TargetRate_bps - rateTable[ k - 1 ], 6 ), rateTable[ k ] - rateTable[ k - 1 ] );
-                psEncC->SNR_dB_Q7 = silk_LSHIFT( silk_SNR_table_Q1[ k - 1 ], 6 ) + silk_MUL( frac_Q6, silk_SNR_table_Q1[ k ] - silk_SNR_table_Q1[ k - 1 ] );
-                break;
+    if( psEncC->fs_kHz == 8 ) {
+        bound = sizeof(silk_TargetRate_NB_21);
+        snr_table = silk_TargetRate_NB_21;
+    } else if( psEncC->fs_kHz == 12 ) {
+        bound = sizeof(silk_TargetRate_MB_21);
+        snr_table = silk_TargetRate_MB_21;
+    } else {
+        bound = sizeof(silk_TargetRate_WB_21);
+        snr_table = silk_TargetRate_WB_21;
    }
+    id = (TargetRate_bps+200)/400;
+    id = silk_min(id - 10, bound-1);
+    if( id <= 0 ) {
+        psEncC->SNR_dB_Q7 = 0;
+    } else {
+        psEncC->SNR_dB_Q7 = snr_table[id]*21;
    }
-    }
-
-    return ret;
+    return SILK_NO_ERROR;
 }
--- a/media/libopus/silk/control_audio_bandwidth.c
+++ b/media/libopus/silk/control_audio_bandwidth.c
@ -39,9 +39,15 @@ opus_int silk_control_audio_bandwidth(
 )
 {
    opus_int   fs_kHz;
+    opus_int   orig_kHz;
    opus_int32 fs_Hz;

-    fs_kHz = psEncC->fs_kHz;
+    orig_kHz = psEncC->fs_kHz;
+    /* Handle a bandwidth-switching reset where we need to be aware what the last sampling rate was. */
+    if( orig_kHz == 0 ) {
+        orig_kHz = psEncC->sLP.saved_fs_kHz;
+    }
+    fs_kHz = orig_kHz;
    fs_Hz = silk_SMULBB( fs_kHz, 1000 );
    if( fs_Hz == 0 ) {
        /* Encoder has just been initialized */
@ -61,7 +67,7 @@ opus_int silk_control_audio_bandwidth(
        }
        if( psEncC->allow_bandwidth_switch || encControl->opusCanSwitch ) {
            /* Check if we should switch down */
-            if( silk_SMULBB( psEncC->fs_kHz, 1000 ) > psEncC->desiredInternal_fs_Hz )
+            if( silk_SMULBB( orig_kHz, 1000 ) > psEncC->desiredInternal_fs_Hz )
            {
                /* Switch down */
                if( psEncC->sLP.mode == 0 ) {
@ -76,7 +82,7 @@ opus_int silk_control_audio_bandwidth(
                    psEncC->sLP.mode = 0;

                    /* Switch to a lower sample frequency */
-                    fs_kHz = psEncC->fs_kHz == 16 ? 12 : 8;
+                    fs_kHz = orig_kHz == 16 ? 12 : 8;
                } else {
                   if( psEncC->sLP.transition_frame_no <= 0 ) {
                       encControl->switchReady = 1;
@ -90,12 +96,12 @@ opus_int silk_control_audio_bandwidth(
            }
            else
            /* Check if we should switch up */
-            if( silk_SMULBB( psEncC->fs_kHz, 1000 ) < psEncC->desiredInternal_fs_Hz )
+            if( silk_SMULBB( orig_kHz, 1000 ) < psEncC->desiredInternal_fs_Hz )
            {
                /* Switch up */
                if( encControl->opusCanSwitch ) {
                    /* Switch to a higher sample frequency */
-                    fs_kHz = psEncC->fs_kHz == 8 ? 12 : 16;
+                    fs_kHz = orig_kHz == 8 ? 12 : 16;

                    /* New transition */
                    psEncC->sLP.transition_frame_no = 0;
--- a/media/libopus/silk/control_codec.c
+++ b/media/libopus/silk/control_codec.c
@ -238,8 +238,8 @@ static opus_int silk_setup_fs(
    }

    /* Set internal sampling frequency */
-    silk_assert( fs_kHz == 8 || fs_kHz == 12 || fs_kHz == 16 );
-    silk_assert( psEnc->sCmn.nb_subfr == 2 || psEnc->sCmn.nb_subfr == 4 );
+    celt_assert( fs_kHz == 8 || fs_kHz == 12 || fs_kHz == 16 );
+    celt_assert( psEnc->sCmn.nb_subfr == 2 || psEnc->sCmn.nb_subfr == 4 );
    if( psEnc->sCmn.fs_kHz != fs_kHz ) {
        /* reset part of the state */
        silk_memset( &psEnc->sShape,               0, sizeof( psEnc->sShape ) );
@ -299,7 +299,7 @@ static opus_int silk_setup_fs(
    }

    /* Check that settings are valid */
-    silk_assert( ( psEnc->sCmn.subfr_length * psEnc->sCmn.nb_subfr ) == psEnc->sCmn.frame_length );
+    celt_assert( ( psEnc->sCmn.subfr_length * psEnc->sCmn.nb_subfr ) == psEnc->sCmn.frame_length );

    return ret;
 }
@ -312,7 +312,7 @@ static opus_int silk_setup_complexity(
    opus_int ret = 0;

    /* Set encoding complexity */
-    silk_assert( Complexity >= 0 && Complexity <= 10 );
+    celt_assert( Complexity >= 0 && Complexity <= 10 );
    if( Complexity < 1 ) {
        psEncC->pitchEstimationComplexity       = SILK_PE_MIN_COMPLEX;
        psEncC->pitchEstimationThreshold_Q16    = SILK_FIX_CONST( 0.8, 16 );
@ -390,12 +390,12 @@ static opus_int silk_setup_complexity(
    psEncC->shapeWinLength          = SUB_FRAME_LENGTH_MS * psEncC->fs_kHz + 2 * psEncC->la_shape;
    psEncC->Complexity              = Complexity;

-    silk_assert( psEncC->pitchEstimationLPCOrder <= MAX_FIND_PITCH_LPC_ORDER );
-    silk_assert( psEncC->shapingLPCOrder         <= MAX_SHAPE_LPC_ORDER      );
-    silk_assert( psEncC->nStatesDelayedDecision  <= MAX_DEL_DEC_STATES       );
-    silk_assert( psEncC->warping_Q16             <= 32767                    );
-    silk_assert( psEncC->la_shape                <= LA_SHAPE_MAX             );
-    silk_assert( psEncC->shapeWinLength          <= SHAPE_LPC_WIN_MAX        );
+    celt_assert( psEncC->pitchEstimationLPCOrder <= MAX_FIND_PITCH_LPC_ORDER );
+    celt_assert( psEncC->shapingLPCOrder         <= MAX_SHAPE_LPC_ORDER      );
+    celt_assert( psEncC->nStatesDelayedDecision  <= MAX_DEL_DEC_STATES       );
+    celt_assert( psEncC->warping_Q16             <= 32767                    );
+    celt_assert( psEncC->la_shape                <= LA_SHAPE_MAX             );
+    celt_assert( psEncC->shapeWinLength          <= SHAPE_LPC_WIN_MAX        );

    return ret;
 }
--- a/media/libopus/silk/dec_API.c
+++ b/media/libopus/silk/dec_API.c
@ -104,7 +104,7 @@ opus_int silk_Decode(                                   /* O    Returns error co
    int delay_stack_alloc;
    SAVE_STACK;

-    silk_assert( decControl->nChannelsInternal == 1 || decControl->nChannelsInternal == 2 );
+    celt_assert( decControl->nChannelsInternal == 1 || decControl->nChannelsInternal == 2 );

    /**********************************/
    /* Test if first frame in payload */
@ -143,13 +143,13 @@ opus_int silk_Decode(                                   /* O    Returns error co
                channel_state[ n ].nFramesPerPacket = 3;
                channel_state[ n ].nb_subfr = 4;
            } else {
-                silk_assert( 0 );
+                celt_assert( 0 );
                RESTORE_STACK;
                return SILK_DEC_INVALID_FRAME_SIZE;
            }
            fs_kHz_dec = ( decControl->internalSampleRate >> 10 ) + 1;
            if( fs_kHz_dec != 8 && fs_kHz_dec != 12 && fs_kHz_dec != 16 ) {
-                silk_assert( 0 );
+                celt_assert( 0 );
                RESTORE_STACK;
                return SILK_DEC_INVALID_SAMPLING_FREQUENCY;
            }
--- a/media/libopus/silk/decode_core.c
+++ b/media/libopus/silk/decode_core.c
@ -141,7 +141,7 @@ void silk_decode_core(
            if( k == 0 || ( k == 2 && NLSF_interpolation_flag ) ) {
                /* Rewhiten with new A coefs */
                start_idx = psDec->ltp_mem_length - lag - psDec->LPC_order - LTP_ORDER / 2;
-                silk_assert( start_idx > 0 );
+                celt_assert( start_idx > 0 );

                if( k == 2 ) {
                    silk_memcpy( &psDec->outBuf[ psDec->ltp_mem_length ], xq, 2 * psDec->subfr_length * sizeof( opus_int16 ) );
@ -196,7 +196,7 @@ void silk_decode_core(

        for( i = 0; i < psDec->subfr_length; i++ ) {
            /* Short-term prediction */
-            silk_assert( psDec->LPC_order == 10 || psDec->LPC_order == 16 );
+            celt_assert( psDec->LPC_order == 10 || psDec->LPC_order == 16 );
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LPC_pred_Q10 = silk_RSHIFT( psDec->LPC_order, 1 );
            LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i -  1 ], A_Q12_tmp[ 0 ] );
--- a/media/libopus/silk/decode_frame.c
+++ b/media/libopus/silk/decode_frame.c
@ -55,7 +55,7 @@ opus_int silk_decode_frame(
    psDecCtrl->LTP_scale_Q14 = 0;

    /* Safety checks */
-    silk_assert( L > 0 && L <= MAX_FRAME_LENGTH );
+    celt_assert( L > 0 && L <= MAX_FRAME_LENGTH );

    if(   lostFlag == FLAG_DECODE_NORMAL ||
        ( lostFlag == FLAG_DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 1 ) )
@ -91,7 +91,7 @@ opus_int silk_decode_frame(

        psDec->lossCnt = 0;
        psDec->prevSignalType = psDec->indices.signalType;
-        silk_assert( psDec->prevSignalType >= 0 && psDec->prevSignalType <= 2 );
+        celt_assert( psDec->prevSignalType >= 0 && psDec->prevSignalType <= 2 );

        /* A frame has been decoded without errors */
        psDec->first_frame_after_reset = 0;
@ -104,7 +104,7 @@ opus_int silk_decode_frame(
    /*************************/
    /* Update output buffer. */
    /*************************/
-    silk_assert( psDec->ltp_mem_length >= psDec->frame_length );
+    celt_assert( psDec->ltp_mem_length >= psDec->frame_length );
    mv_len = psDec->ltp_mem_length - psDec->frame_length;
    silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) );
    silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) );
--- a/media/libopus/silk/decode_indices.c
+++ b/media/libopus/silk/decode_indices.c
@ -79,7 +79,7 @@ void silk_decode_indices(
    /**********************/
    psDec->indices.NLSFIndices[ 0 ] = (opus_int8)ec_dec_icdf( psRangeDec, &psDec->psNLSF_CB->CB1_iCDF[ ( psDec->indices.signalType >> 1 ) * psDec->psNLSF_CB->nVectors ], 8 );
    silk_NLSF_unpack( ec_ix, pred_Q8, psDec->psNLSF_CB, psDec->indices.NLSFIndices[ 0 ] );
-    silk_assert( psDec->psNLSF_CB->order == psDec->LPC_order );
+    celt_assert( psDec->psNLSF_CB->order == psDec->LPC_order );
    for( i = 0; i < psDec->psNLSF_CB->order; i++ ) {
        Ix = ec_dec_icdf( psRangeDec, &psDec->psNLSF_CB->ec_iCDF[ ec_ix[ i ] ], 8 );
        if( Ix == 0 ) {
--- a/media/libopus/silk/decode_pitch.c
+++ b/media/libopus/silk/decode_pitch.c
@ -51,7 +51,7 @@ void silk_decode_pitch(
            Lag_CB_ptr = &silk_CB_lags_stage2[ 0 ][ 0 ];
            cbk_size   = PE_NB_CBKS_STAGE2_EXT;
        } else {
-            silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1 );
+            celt_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1 );
            Lag_CB_ptr = &silk_CB_lags_stage2_10_ms[ 0 ][ 0 ];
            cbk_size   = PE_NB_CBKS_STAGE2_10MS;
        }
@ -60,7 +60,7 @@ void silk_decode_pitch(
            Lag_CB_ptr = &silk_CB_lags_stage3[ 0 ][ 0 ];
            cbk_size   = PE_NB_CBKS_STAGE3_MAX;
        } else {
-            silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1 );
+            celt_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1 );
            Lag_CB_ptr = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ];
            cbk_size   = PE_NB_CBKS_STAGE3_10MS;
        }
--- a/media/libopus/silk/decode_pulses.c
+++ b/media/libopus/silk/decode_pulses.c
@ -56,7 +56,7 @@ void silk_decode_pulses(
    silk_assert( 1 << LOG2_SHELL_CODEC_FRAME_LENGTH == SHELL_CODEC_FRAME_LENGTH );
    iter = silk_RSHIFT( frame_length, LOG2_SHELL_CODEC_FRAME_LENGTH );
    if( iter * SHELL_CODEC_FRAME_LENGTH < frame_length ) {
-        silk_assert( frame_length == 12 * 10 ); /* Make sure only happens for 10 ms @ 12 kHz */
+        celt_assert( frame_length == 12 * 10 ); /* Make sure only happens for 10 ms @ 12 kHz */
        iter++;
    }

--- a/media/libopus/silk/decoder_set_fs.c
+++ b/media/libopus/silk/decoder_set_fs.c
@ -40,8 +40,8 @@ opus_int silk_decoder_set_fs(
 {
    opus_int frame_length, ret = 0;

-    silk_assert( fs_kHz == 8 || fs_kHz == 12 || fs_kHz == 16 );
-    silk_assert( psDec->nb_subfr == MAX_NB_SUBFR || psDec->nb_subfr == MAX_NB_SUBFR/2 );
+    celt_assert( fs_kHz == 8 || fs_kHz == 12 || fs_kHz == 16 );
+    celt_assert( psDec->nb_subfr == MAX_NB_SUBFR || psDec->nb_subfr == MAX_NB_SUBFR/2 );

    /* New (sub)frame length */
    psDec->subfr_length = silk_SMULBB( SUB_FRAME_LENGTH_MS, fs_kHz );
@ -86,7 +86,7 @@ opus_int silk_decoder_set_fs(
                psDec->pitch_lag_low_bits_iCDF = silk_uniform4_iCDF;
            } else {
                /* unsupported sampling rate */
-                silk_assert( 0 );
+                celt_assert( 0 );
            }
            psDec->first_frame_after_reset = 1;
            psDec->lagPrev                 = 100;
@ -101,7 +101,7 @@ opus_int silk_decoder_set_fs(
    }

    /* Check that settings are valid */
-    silk_assert( psDec->frame_length > 0 && psDec->frame_length <= MAX_FRAME_LENGTH );
+    celt_assert( psDec->frame_length > 0 && psDec->frame_length <= MAX_FRAME_LENGTH );

    return ret;
 }
--- a/media/libopus/silk/define.h
+++ b/media/libopus/silk/define.h
@ -46,7 +46,6 @@ extern "C"
 /* Limits on bitrate */
 #define MIN_TARGET_RATE_BPS                     5000
 #define MAX_TARGET_RATE_BPS                     80000
-#define TARGET_RATE_TAB_SZ                      8

 /* LBRR thresholds */
 #define LBRR_NB_MIN_RATE_BPS                    12000
@ -58,6 +57,11 @@ extern "C"
 #define MAX_CONSECUTIVE_DTX                     20      /* eq 400 ms */
 #define DTX_ACTIVITY_THRESHOLD                  0.1f

+/* VAD decision */
+#define VAD_NO_DECISION                         -1
+#define VAD_NO_ACTIVITY                         0
+#define VAD_ACTIVITY                            1
+
 /* Maximum sampling frequency */
 #define MAX_FS_KHZ                              16
 #define MAX_API_FS_KHZ                          48
--- a/media/libopus/silk/enc_API.c
+++ b/media/libopus/silk/enc_API.c
@ -82,7 +82,7 @@ opus_int silk_InitEncoder(                              /* O    Returns error co
    silk_memset( psEnc, 0, sizeof( silk_encoder ) );
    for( n = 0; n < ENCODER_NUM_CHANNELS; n++ ) {
        if( ret += silk_init_encoder( &psEnc->state_Fxx[ n ], arch ) ) {
-            silk_assert( 0 );
+            celt_assert( 0 );
        }
    }

@ -91,7 +91,7 @@ opus_int silk_InitEncoder(                              /* O    Returns error co

    /* Read control structure */
    if( ret += silk_QueryEncoder( encState, encStatus ) ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
    }

    return ret;
@ -144,7 +144,8 @@ opus_int silk_Encode(                                   /* O    Returns error co
    opus_int                        nSamplesIn,         /* I    Number of samples in input vector               */
    ec_enc                          *psRangeEnc,        /* I/O  Compressor data structure                       */
    opus_int32                      *nBytesOut,         /* I/O  Number of bytes in payload (input: Max bytes)   */
-    const opus_int                  prefillFlag         /* I    Flag to indicate prefilling buffers no coding   */
+    const opus_int                  prefillFlag,        /* I    Flag to indicate prefilling buffers no coding   */
+    opus_int                        activity            /* I    Decision of Opus voice activity detector        */
 )
 {
    opus_int   n, i, nBits, flags, tmp_payloadSize_ms = 0, tmp_complexity = 0, ret = 0;
@ -166,7 +167,7 @@ opus_int silk_Encode(                                   /* O    Returns error co

    /* Check values in encoder control structure */
    if( ( ret = check_control_input( encControl ) ) != 0 ) {
-        silk_assert( 0 );
+        celt_assert( 0 );
        RESTORE_STACK;
        return ret;
    }
@ -199,16 +200,26 @@ opus_int silk_Encode(                                   /* O    Returns error co
    tot_blocks = ( nBlocksOf10ms > 1 ) ? nBlocksOf10ms >> 1 : 1;
    curr_block = 0;
    if( prefillFlag ) {
+        silk_LP_state save_LP;
        /* Only accept input length of 10 ms */
        if( nBlocksOf10ms != 1 ) {
-            silk_assert( 0 );
+            celt_assert( 0 );
            RESTORE_STACK;
            return SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES;
        }
+        if ( prefillFlag == 2 ) {
+            save_LP = psEnc->state_Fxx[ 0 ].sCmn.sLP;
+            /* Save the sampling rate so the bandwidth switching code can keep handling transitions. */
+            save_LP.saved_fs_kHz = psEnc->state_Fxx[ 0 ].sCmn.fs_kHz;
+        }
        /* Reset Encoder */
        for( n = 0; n < encControl->nChannelsInternal; n++ ) {
            ret = silk_init_encoder( &psEnc->state_Fxx[ n ], psEnc->state_Fxx[ n ].sCmn.arch );
-            silk_assert( !ret );
+            /* Restore the variable LP state. */
+            if ( prefillFlag == 2 ) {
+                psEnc->state_Fxx[ n ].sCmn.sLP = save_LP;
+            }
+            celt_assert( !ret );
        }
        tmp_payloadSize_ms = encControl->payloadSize_ms;
        encControl->payloadSize_ms = 10;
@ -221,13 +232,13 @@ opus_int silk_Encode(                                   /* O    Returns error co
    } else {
        /* Only accept input lengths that are a multiple of 10 ms */
        if( nBlocksOf10ms * encControl->API_sampleRate != 100 * nSamplesIn || nSamplesIn < 0 ) {
-            silk_assert( 0 );
+            celt_assert( 0 );
            RESTORE_STACK;
            return SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES;
        }
        /* Make sure no more than one packet can be produced */
        if( 1000 * (opus_int32)nSamplesIn > encControl->payloadSize_ms * encControl->API_sampleRate ) {
-            silk_assert( 0 );
+            celt_assert( 0 );
            RESTORE_STACK;
            return SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES;
        }
@ -248,7 +259,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
        }
        psEnc->state_Fxx[ n ].sCmn.inDTX = psEnc->state_Fxx[ n ].sCmn.useDTX;
    }
-    silk_assert( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 0 ].sCmn.fs_kHz == psEnc->state_Fxx[ 1 ].sCmn.fs_kHz );
+    celt_assert( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 0 ].sCmn.fs_kHz == psEnc->state_Fxx[ 1 ].sCmn.fs_kHz );

    /* Input buffering/resampling and encoding */
    nSamplesToBufferMax =
@ -306,7 +317,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
            }
            psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer;
        } else {
-            silk_assert( encControl->nChannelsAPI == 1 && encControl->nChannelsInternal == 1 );
+            celt_assert( encControl->nChannelsAPI == 1 && encControl->nChannelsInternal == 1 );
            silk_memcpy(buf, samplesIn, nSamplesFromInput*sizeof(opus_int16));
            ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
                &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
@ -322,8 +333,8 @@ opus_int silk_Encode(                                   /* O    Returns error co
        /* Silk encoder */
        if( psEnc->state_Fxx[ 0 ].sCmn.inputBufIx >= psEnc->state_Fxx[ 0 ].sCmn.frame_length ) {
            /* Enough data in input buffer, so encode */
-            silk_assert( psEnc->state_Fxx[ 0 ].sCmn.inputBufIx == psEnc->state_Fxx[ 0 ].sCmn.frame_length );
-            silk_assert( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 1 ].sCmn.inputBufIx == psEnc->state_Fxx[ 1 ].sCmn.frame_length );
+            celt_assert( psEnc->state_Fxx[ 0 ].sCmn.inputBufIx == psEnc->state_Fxx[ 0 ].sCmn.frame_length );
+            celt_assert( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 1 ].sCmn.inputBufIx == psEnc->state_Fxx[ 1 ].sCmn.frame_length );

            /* Deal with LBRR data */
            if( psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded == 0 && !prefillFlag ) {
@ -425,7 +436,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
                        psEnc->state_Fxx[ 1 ].sCmn.sNSQ.prev_gain_Q16      = 65536;
                        psEnc->state_Fxx[ 1 ].sCmn.first_frame_after_reset = 1;
                    }
-                    silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 1 ] );
+                    silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 1 ], activity );
                } else {
                    psEnc->state_Fxx[ 1 ].sCmn.VAD_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] = 0;
                }
@ -440,7 +451,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
                silk_memcpy( psEnc->state_Fxx[ 0 ].sCmn.inputBuf, psEnc->sStereo.sMid, 2 * sizeof( opus_int16 ) );
                silk_memcpy( psEnc->sStereo.sMid, &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.frame_length ], 2 * sizeof( opus_int16 ) );
            }
-            silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 0 ] );
+            silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 0 ], activity );

            /* Encode */
            for( n = 0; n < encControl->nChannelsInternal; n++ ) {
--- a/media/libopus/silk/encode_indices.c
+++ b/media/libopus/silk/encode_indices.c
@ -56,8 +56,8 @@ void silk_encode_indices(
    /* Encode signal type and quantizer offset */
    /*******************************************/
    typeOffset = 2 * psIndices->signalType + psIndices->quantOffsetType;
-    silk_assert( typeOffset >= 0 && typeOffset < 6 );
-    silk_assert( encode_LBRR == 0 || typeOffset >= 2 );
+    celt_assert( typeOffset >= 0 && typeOffset < 6 );
+    celt_assert( encode_LBRR == 0 || typeOffset >= 2 );
    if( encode_LBRR || typeOffset >= 2 ) {
        ec_enc_icdf( psRangeEnc, typeOffset - 2, silk_type_offset_VAD_iCDF, 8 );
    } else {
@ -90,7 +90,7 @@ void silk_encode_indices(
    /****************/
    ec_enc_icdf( psRangeEnc, psIndices->NLSFIndices[ 0 ], &psEncC->psNLSF_CB->CB1_iCDF[ ( psIndices->signalType >> 1 ) * psEncC->psNLSF_CB->nVectors ], 8 );
    silk_NLSF_unpack( ec_ix, pred_Q8, psEncC->psNLSF_CB, psIndices->NLSFIndices[ 0 ] );
-    silk_assert( psEncC->psNLSF_CB->order == psEncC->predictLPCOrder );
+    celt_assert( psEncC->psNLSF_CB->order == psEncC->predictLPCOrder );
    for( i = 0; i < psEncC->psNLSF_CB->order; i++ ) {
        if( psIndices->NLSFIndices[ i+1 ] >= NLSF_QUANT_MAX_AMPLITUDE ) {
            ec_enc_icdf( psRangeEnc, 2 * NLSF_QUANT_MAX_AMPLITUDE, &psEncC->psNLSF_CB->ec_iCDF[ ec_ix[ i ] ], 8 );
--- a/media/libopus/silk/encode_pulses.c
+++ b/media/libopus/silk/encode_pulses.c
@ -86,7 +86,7 @@ void silk_encode_pulses(
    silk_assert( 1 << LOG2_SHELL_CODEC_FRAME_LENGTH == SHELL_CODEC_FRAME_LENGTH );
    iter = silk_RSHIFT( frame_length, LOG2_SHELL_CODEC_FRAME_LENGTH );
    if( iter * SHELL_CODEC_FRAME_LENGTH < frame_length ) {
-        silk_assert( frame_length == 12 * 10 ); /* Make sure only happens for 10 ms @ 12 kHz */
+        celt_assert( frame_length == 12 * 10 ); /* Make sure only happens for 10 ms @ 12 kHz */
        iter++;
        silk_memset( &pulses[ frame_length ], 0, SHELL_CODEC_FRAME_LENGTH * sizeof(opus_int8));
    }
--- a/media/libopus/silk/fixed/apply_sine_window_FIX.c
+++ b/media/libopus/silk/fixed/apply_sine_window_FIX.c
@ -57,15 +57,15 @@ void silk_apply_sine_window(
    opus_int   k, f_Q16, c_Q16;
    opus_int32 S0_Q16, S1_Q16;

-    silk_assert( win_type == 1 || win_type == 2 );
+    celt_assert( win_type == 1 || win_type == 2 );

    /* Length must be in a range from 16 to 120 and a multiple of 4 */
-    silk_assert( length >= 16 && length <= 120 );
-    silk_assert( ( length & 3 ) == 0 );
+    celt_assert( length >= 16 && length <= 120 );
+    celt_assert( ( length & 3 ) == 0 );

    /* Frequency */
    k = ( length >> 2 ) - 4;
-    silk_assert( k >= 0 && k <= 26 );
+    celt_assert( k >= 0 && k <= 26 );
    f_Q16 = (opus_int)freq_table_Q16[ k ];

    /* Factor used for cosine approximation */
--- a/media/libopus/silk/fixed/burg_modified_FIX.c
+++ b/media/libopus/silk/fixed/burg_modified_FIX.c
@ -65,7 +65,7 @@ void silk_burg_modified_c(
    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
    opus_int64       C0_64;

-    silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
+    celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );

    /* Compute autocorrelations, added over subframes */
    C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
--- a/media/libopus/silk/fixed/encode_frame_FIX.c
+++ b/media/libopus/silk/fixed/encode_frame_FIX.c
@ -43,21 +43,28 @@ static OPUS_INLINE void silk_LBRR_encode_FIX(
 );

 void silk_encode_do_VAD_FIX(
-    silk_encoder_state_FIX          *psEnc                                  /* I/O  Pointer to Silk FIX encoder state                                           */
+    silk_encoder_state_FIX          *psEnc,                                 /* I/O  Pointer to Silk FIX encoder state                                           */
+    opus_int                        activity                                /* I    Decision of Opus voice activity detector                                    */
 )
 {
+    const opus_int activity_threshold = SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 );
+
    /****************************/
    /* Voice Activity Detection */
    /****************************/
    silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch );
+    /* If Opus VAD is inactive and Silk VAD is active: lower Silk VAD to just under the threshold */
+    if( activity == VAD_NO_ACTIVITY && psEnc->sCmn.speech_activity_Q8 >= activity_threshold ) {
+        psEnc->sCmn.speech_activity_Q8 = activity_threshold - 1;
+    }

    /**************************************************/
    /* Convert speech activity into VAD and DTX flags */
    /**************************************************/
-    if( psEnc->sCmn.speech_activity_Q8 < SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ) ) {
+    if( psEnc->sCmn.speech_activity_Q8 < activity_threshold ) {
        psEnc->sCmn.indices.signalType = TYPE_NO_VOICE_ACTIVITY;
        psEnc->sCmn.noSpeechCounter++;
-        if( psEnc->sCmn.noSpeechCounter < NB_SPEECH_FRAMES_BEFORE_DTX ) {
+        if( psEnc->sCmn.noSpeechCounter <= NB_SPEECH_FRAMES_BEFORE_DTX ) {
            psEnc->sCmn.inDTX = 0;
        } else if( psEnc->sCmn.noSpeechCounter > MAX_CONSECUTIVE_DTX + NB_SPEECH_FRAMES_BEFORE_DTX ) {
            psEnc->sCmn.noSpeechCounter = NB_SPEECH_FRAMES_BEFORE_DTX;
@ -255,7 +262,7 @@ opus_int silk_encode_frame_FIX(
                if( found_lower && ( gainsID == gainsID_lower || nBits > maxBits ) ) {
                    /* Restore output state from earlier iteration that did meet the bitrate budget */
                    silk_memcpy( psRangeEnc, &sRangeEnc_copy2, sizeof( ec_enc ) );
-                    silk_assert( sRangeEnc_copy2.offs <= 1275 );
+                    celt_assert( sRangeEnc_copy2.offs <= 1275 );
                    silk_memcpy( psRangeEnc->buf, ec_buf_copy, sRangeEnc_copy2.offs );
                    silk_memcpy( &psEnc->sCmn.sNSQ, &sNSQ_copy2, sizeof( silk_nsq_state ) );
                    psEnc->sShape.LastGainIndex = LastGainIndex_copy2;
@ -283,7 +290,7 @@ opus_int silk_encode_frame_FIX(
                    gainsID_lower = gainsID;
                    /* Copy part of the output state */
                    silk_memcpy( &sRangeEnc_copy2, psRangeEnc, sizeof( ec_enc ) );
-                    silk_assert( psRangeEnc->offs <= 1275 );
+                    celt_assert( psRangeEnc->offs <= 1275 );
                    silk_memcpy( ec_buf_copy, psRangeEnc->buf, psRangeEnc->offs );
                    silk_memcpy( &sNSQ_copy2, &psEnc->sCmn.sNSQ, sizeof( silk_nsq_state ) );
                    LastGainIndex_copy2 = psEnc->sShape.LastGainIndex;
--- a/media/libopus/silk/fixed/find_LPC_FIX.c
+++ b/media/libopus/silk/fixed/find_LPC_FIX.c
@ -146,6 +146,6 @@ void silk_find_LPC_FIX(
        silk_A2NLSF( NLSF_Q15, a_Q16, psEncC->predictLPCOrder );
    }

-    silk_assert( psEncC->indices.NLSFInterpCoef_Q2 == 4 || ( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) );
+    celt_assert( psEncC->indices.NLSFInterpCoef_Q2 == 4 || ( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) );
    RESTORE_STACK;
 }
--- a/media/libopus/silk/fixed/find_pitch_lags_FIX.c
+++ b/media/libopus/silk/fixed/find_pitch_lags_FIX.c
@ -59,7 +59,7 @@ void silk_find_pitch_lags_FIX(
    buf_len = psEnc->sCmn.la_pitch + psEnc->sCmn.frame_length + psEnc->sCmn.ltp_mem_length;

    /* Safety check */
-    silk_assert( buf_len >= psEnc->sCmn.pitch_LPC_win_length );
+    celt_assert( buf_len >= psEnc->sCmn.pitch_LPC_win_length );

    /*************************************/
    /* Estimate LPC AR coefficients      */
--- a/media/libopus/silk/fixed/find_pred_coefs_FIX.c
+++ b/media/libopus/silk/fixed/find_pred_coefs_FIX.c
@ -80,7 +80,7 @@ void silk_find_pred_coefs_FIX(
        /**********/
        /* VOICED */
        /**********/
-        silk_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );
+        celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );

        ALLOC( xXLTP_Q17, psEnc->sCmn.nb_subfr * LTP_ORDER, opus_int32 );
        ALLOC( XXLTP_Q17, psEnc->sCmn.nb_subfr * LTP_ORDER * LTP_ORDER, opus_int32 );
--- a/media/libopus/silk/fixed/main_FIX.h
+++ b/media/libopus/silk/fixed/main_FIX.h
@ -66,7 +66,8 @@ void silk_HP_variable_cutoff(

 /* Encoder main function */
 void silk_encode_do_VAD_FIX(
-    silk_encoder_state_FIX          *psEnc                                  /* I/O  Pointer to Silk FIX encoder state                                           */
+    silk_encoder_state_FIX          *psEnc,                                 /* I/O  Pointer to Silk FIX encoder state                                           */
+    opus_int                        activity                                /* I    Decision of Opus voice activity detector                                    */
 );

 /* Encoder main function */
--- a/media/libopus/silk/fixed/pitch_analysis_core_FIX.c
+++ b/media/libopus/silk/fixed/pitch_analysis_core_FIX.c
@ -122,11 +122,11 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
    SAVE_STACK;

    /* Check for valid sampling frequency */
-    silk_assert( Fs_kHz == 8 || Fs_kHz == 12 || Fs_kHz == 16 );
+    celt_assert( Fs_kHz == 8 || Fs_kHz == 12 || Fs_kHz == 16 );

    /* Check for valid complexity setting */
-    silk_assert( complexity >= SILK_PE_MIN_COMPLEX );
-    silk_assert( complexity <= SILK_PE_MAX_COMPLEX );
+    celt_assert( complexity >= SILK_PE_MIN_COMPLEX );
+    celt_assert( complexity <= SILK_PE_MAX_COMPLEX );

    silk_assert( search_thres1_Q16 >= 0 && search_thres1_Q16 <= (1<<16) );
    silk_assert( search_thres2_Q13 >= 0 && search_thres2_Q13 <= (1<<13) );
@ -164,7 +164,7 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
        silk_resampler_down2_3( filt_state, frame_8kHz_buf, frame, frame_length );
        frame_8kHz = frame_8kHz_buf;
    } else {
-        silk_assert( Fs_kHz == 8 );
+        celt_assert( Fs_kHz == 8 );
        frame_8kHz = frame;
    }

@ -188,14 +188,14 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
    target_ptr = &frame_4kHz[ silk_LSHIFT( SF_LENGTH_4KHZ, 2 ) ];
    for( k = 0; k < nb_subfr >> 1; k++ ) {
        /* Check that we are within range of the array */
-        silk_assert( target_ptr >= frame_4kHz );
-        silk_assert( target_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );
+        celt_assert( target_ptr >= frame_4kHz );
+        celt_assert( target_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );

        basis_ptr = target_ptr - MIN_LAG_4KHZ;

        /* Check that we are within range of the array */
-        silk_assert( basis_ptr >= frame_4kHz );
-        silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );
+        celt_assert( basis_ptr >= frame_4kHz );
+        celt_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );

        celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1, arch );

@ -249,7 +249,7 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0

    /* Sort */
    length_d_srch = silk_ADD_LSHIFT32( 4, complexity, 1 );
-    silk_assert( 3 * length_d_srch <= PE_D_SRCH_LENGTH );
+    celt_assert( 3 * length_d_srch <= PE_D_SRCH_LENGTH );
    silk_insertion_sort_decreasing_int16( C, d_srch, CSTRIDE_4KHZ,
                                          length_d_srch );

@ -274,7 +274,7 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
            break;
        }
    }
-    silk_assert( length_d_srch > 0 );
+    celt_assert( length_d_srch > 0 );

    ALLOC( d_comp, D_COMP_STRIDE, opus_int16 );
    for( i = D_COMP_MIN; i < D_COMP_MAX; i++ ) {
@ -325,8 +325,8 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
    for( k = 0; k < nb_subfr; k++ ) {

        /* Check that we are within range of the array */
-        silk_assert( target_ptr >= frame_8kHz );
-        silk_assert( target_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );
+        celt_assert( target_ptr >= frame_8kHz );
+        celt_assert( target_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );

        energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ, arch ), 1 );
        for( j = 0; j < length_d_comp; j++ ) {
@ -550,7 +550,7 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
        *lagIndex = (opus_int16)( lag - MIN_LAG_8KHZ );
        *contourIndex = (opus_int8)CBimax;
    }
-    silk_assert( *lagIndex >= 0 );
+    celt_assert( *lagIndex >= 0 );
    /* return as voiced */
    RESTORE_STACK;
    return 0;
@ -587,8 +587,8 @@ static void silk_P_Ana_calc_corr_st3(
    const opus_int8 *Lag_range_ptr, *Lag_CB_ptr;
    SAVE_STACK;

-    silk_assert( complexity >= SILK_PE_MIN_COMPLEX );
-    silk_assert( complexity <= SILK_PE_MAX_COMPLEX );
+    celt_assert( complexity >= SILK_PE_MIN_COMPLEX );
+    celt_assert( complexity <= SILK_PE_MAX_COMPLEX );

    if( nb_subfr == PE_MAX_NB_SUBFR ) {
        Lag_range_ptr = &silk_Lag_range_stage3[ complexity ][ 0 ][ 0 ];
@ -596,7 +596,7 @@ static void silk_P_Ana_calc_corr_st3(
        nb_cbk_search = silk_nb_cbk_searchs_stage3[ complexity ];
        cbk_size      = PE_NB_CBKS_STAGE3_MAX;
    } else {
-        silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1);
+        celt_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1);
        Lag_range_ptr = &silk_Lag_range_stage3_10_ms[ 0 ][ 0 ];
        Lag_CB_ptr    = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ];
        nb_cbk_search = PE_NB_CBKS_STAGE3_10MS;
@ -612,7 +612,7 @@ static void silk_P_Ana_calc_corr_st3(
        /* Calculate the correlations for each subframe */
        lag_low  = matrix_ptr( Lag_range_ptr, k, 0, 2 );
        lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 );
-        silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE);
+        celt_assert(lag_high-lag_low+1 <= SCRATCH_SIZE);
        celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1, arch );
        for( j = lag_low; j <= lag_high; j++ ) {
            silk_assert( lag_counter < SCRATCH_SIZE );
@ -659,8 +659,8 @@ static void silk_P_Ana_calc_energy_st3(
    const opus_int8 *Lag_range_ptr, *Lag_CB_ptr;
    SAVE_STACK;

-    silk_assert( complexity >= SILK_PE_MIN_COMPLEX );
-    silk_assert( complexity <= SILK_PE_MAX_COMPLEX );
+    celt_assert( complexity >= SILK_PE_MIN_COMPLEX );
+    celt_assert( complexity <= SILK_PE_MAX_COMPLEX );

    if( nb_subfr == PE_MAX_NB_SUBFR ) {
        Lag_range_ptr = &silk_Lag_range_stage3[ complexity ][ 0 ][ 0 ];
@ -668,7 +668,7 @@ static void silk_P_Ana_calc_energy_st3(
        nb_cbk_search = silk_nb_cbk_searchs_stage3[ complexity ];
        cbk_size      = PE_NB_CBKS_STAGE3_MAX;
    } else {
-        silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1);
+        celt_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1);
        Lag_range_ptr = &silk_Lag_range_stage3_10_ms[ 0 ][ 0 ];
        Lag_CB_ptr    = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ];
        nb_cbk_search = PE_NB_CBKS_STAGE3_10MS;
--- a/media/libopus/silk/fixed/residual_energy16_FIX.c
+++ b/media/libopus/silk/fixed/residual_energy16_FIX.c
@ -47,10 +47,10 @@ opus_int32 silk_residual_energy16_covar_FIX(
    const opus_int32 *pRow;

    /* Safety checks */
-    silk_assert( D >=  0 );
-    silk_assert( D <= 16 );
-    silk_assert( cQ >  0 );
-    silk_assert( cQ < 16 );
+    celt_assert( D >=  0 );
+    celt_assert( D <= 16 );
+    celt_assert( cQ >  0 );
+    celt_assert( cQ < 16 );

    lshifts = 16 - cQ;
    Qxtra = lshifts;
--- a/media/libopus/silk/fixed/residual_energy_FIX.c
+++ b/media/libopus/silk/fixed/residual_energy_FIX.c
@ -58,7 +58,7 @@ void silk_residual_energy_FIX(

    /* Filter input to create the LPC residual for each frame half, and measure subframe energies */
    ALLOC( LPC_res, ( MAX_NB_SUBFR >> 1 ) * offset, opus_int16 );
-    silk_assert( ( nb_subfr >> 1 ) * ( MAX_NB_SUBFR >> 1 ) == nb_subfr );
+    celt_assert( ( nb_subfr >> 1 ) * ( MAX_NB_SUBFR >> 1 ) == nb_subfr );
    for( i = 0; i < nb_subfr >> 1; i++ ) {
        /* Calculate half frame LPC residual signal including preceding samples */
        silk_LPC_analysis_filter( LPC_res, x_ptr, a_Q12[ i ], ( MAX_NB_SUBFR >> 1 ) * offset, LPC_order, arch );
--- a/media/libopus/silk/fixed/schur64_FIX.c
+++ b/media/libopus/silk/fixed/schur64_FIX.c
@ -43,7 +43,7 @@ opus_int32 silk_schur64(                            /* O    returns residual ene
    opus_int32 C[ SILK_MAX_ORDER_LPC + 1 ][ 2 ];
    opus_int32 Ctmp1_Q30, Ctmp2_Q30, rc_tmp_Q31;

-    silk_assert( order >= 0 && order <= SILK_MAX_ORDER_LPC );
+    celt_assert( order >= 0 && order <= SILK_MAX_ORDER_LPC );

    /* Check for invalid input */
    if( c[ 0 ] <= 0 ) {
--- a/media/libopus/silk/fixed/schur_FIX.c
+++ b/media/libopus/silk/fixed/schur_FIX.c
@ -43,7 +43,7 @@ opus_int32 silk_schur(                              /* O    Returns residual ene
    opus_int32    C[ SILK_MAX_ORDER_LPC + 1 ][ 2 ];
    opus_int32    Ctmp1, Ctmp2, rc_tmp_Q15;

-    silk_assert( order >= 0 && order <= SILK_MAX_ORDER_LPC );
+    celt_assert( order >= 0 && order <= SILK_MAX_ORDER_LPC );

    /* Get number of leading zeros */
    lz = silk_CLZ32( c[ 0 ] );
--- a/media/libopus/silk/fixed/warped_autocorrelation_FIX.c
+++ b/media/libopus/silk/fixed/warped_autocorrelation_FIX.c
@ -52,7 +52,7 @@ void silk_warped_autocorrelation_FIX_c(
    opus_int64 corr_QC[  MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };

    /* Order must be even */
-    silk_assert( ( order & 1 ) == 0 );
+    celt_assert( ( order & 1 ) == 0 );
    silk_assert( 2 * QS - QC >= 0 );

    /* Loop over samples */
--- a/media/libopus/silk/fixed/x86/burg_modified_FIX_sse4_1.c
+++ b/media/libopus/silk/fixed/x86/burg_modified_FIX_sse4_1.c
@ -0,0 +1,377 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "SigProc_FIX.h"
+#include "define.h"
+#include "tuning_parameters.h"
+#include "pitch.h"
+#include "celt/x86/x86cpu.h"
+
+#define MAX_FRAME_SIZE              384             /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
+
+#define QA                          25
+#define N_BITS_HEAD_ROOM            2
+#define MIN_RSHIFTS                 -16
+#define MAX_RSHIFTS                 (32 - QA)
+
+/* Compute reflection coefficients from input signal */
+void silk_burg_modified_sse4_1(
+    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
+    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
+    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
+    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )       */
+    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
+    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
+    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
+    const opus_int              D,                  /* I    Order                                                       */
+    int                         arch                /* I    Run-time architecture                                       */
+)
+{
+    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
+    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
+    const opus_int16 *x_ptr;
+    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
+    opus_int32       C_last_row[  SILK_MAX_ORDER_LPC ];
+    opus_int32       Af_QA[       SILK_MAX_ORDER_LPC ];
+    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
+    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
+    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
+
+    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
+    __m128i CONST1 = _mm_set1_epi32(1);
+
+    celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
+
+    /* Compute autocorrelations, added over subframes */
+    silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
+    if( rshifts > MAX_RSHIFTS ) {
+        C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
+        silk_assert( C0 > 0 );
+        rshifts = MAX_RSHIFTS;
+    } else {
+        lz = silk_CLZ32( C0 ) - 1;
+        rshifts_extra = N_BITS_HEAD_ROOM - lz;
+        if( rshifts_extra > 0 ) {
+            rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
+            C0 = silk_RSHIFT32( C0, rshifts_extra );
+        } else {
+            rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
+            C0 = silk_LSHIFT32( C0, -rshifts_extra );
+        }
+        rshifts += rshifts_extra;
+    }
+    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
+    silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
+    if( rshifts > 0 ) {
+        for( s = 0; s < nb_subfr; s++ ) {
+            x_ptr = x + s * subfr_length;
+            for( n = 1; n < D + 1; n++ ) {
+                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
+                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+            }
+        }
+    } else {
+        for( s = 0; s < nb_subfr; s++ ) {
+            int i;
+            opus_int32 d;
+            x_ptr = x + s * subfr_length;
+            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
+            for( n = 1; n < D + 1; n++ ) {
+               for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
+                  d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
+               xcorr[ n - 1 ] += d;
+            }
+            for( n = 1; n < D + 1; n++ ) {
+                C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
+            }
+        }
+    }
+    silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
+
+    /* Initialize */
+    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
+
+    invGain_Q30 = (opus_int32)1 << 30;
+    reached_max_gain = 0;
+    for( n = 0; n < D; n++ ) {
+        /* Update first row of correlation matrix (without first element) */
+        /* Update last row of correlation matrix (without last element, stored in reversed order) */
+        /* Update C * Af */
+        /* Update C * flipud(Af) (stored in reversed order) */
+        if( rshifts > -2 ) {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    16 - rshifts );        /* Q(16-rshifts) */
+                x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts );        /* Q(16-rshifts) */
+                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    QA - 16 );             /* Q(QA-16) */
+                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 );             /* Q(QA-16) */
+                for( k = 0; k < n; k++ ) {
+                    C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
+                    C_last_row[ k ]  = silk_SMLAWB( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
+                    Atmp_QA = Af_QA[ k ];
+                    tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ]            );                 /* Q(QA-16) */
+                    tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] );                 /* Q(QA-16) */
+                }
+                tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
+                tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
+                for( k = 0; k <= n; k++ ) {
+                    CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ]                    );        /* Q( -rshift ) */
+                    CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] );        /* Q( -rshift ) */
+                }
+            }
+        } else {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    -rshifts );            /* Q( -rshifts ) */
+                x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts );            /* Q( -rshifts ) */
+                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    17 );                  /* Q17 */
+                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 );                  /* Q17 */
+
+                X1_3210 = _mm_set1_epi32( x1 );
+                X2_3210 = _mm_set1_epi32( x2 );
+                TMP1_3210 = _mm_setzero_si128();
+                TMP2_3210 = _mm_setzero_si128();
+                for( k = 0; k < n - 3; k += 4 ) {
+                    PTR_3210   = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
+                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
+                    FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
+                    PTR_3210   = _mm_shuffle_epi32( PTR_3210,  _MM_SHUFFLE( 0, 1, 2, 3 ) );
+                    LAST_3210  = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
+                    ATMP_3210  = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );
+
+                    T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
+                    T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );
+
+                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
+                    ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
+                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );
+
+                    FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
+                    LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );
+
+                    PTR_3210   = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
+                    SUBFR_3210   = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );
+
+                    _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
+                    _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );
+
+                    TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
+                    TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
+                }
+
+                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
+                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
+                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
+                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );
+
+                tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
+                tmp2 += _mm_cvtsi128_si32( TMP2_3210 );
+
+                for( ; k < n; k++ ) {
+                    C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
+                    C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
+                    Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
+                    tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
+                    tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
+                }
+
+                tmp1 = -tmp1;                /* Q17 */
+                tmp2 = -tmp2;                /* Q17 */
+
+                {
+                    __m128i xmm_tmp1, xmm_tmp2;
+                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
+                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;
+
+                    xmm_tmp1 = _mm_set1_epi32( tmp1 );
+                    xmm_tmp2 = _mm_set1_epi32( tmp2 );
+
+                    for( k = 0; k <= n - 3; k += 4 ) {
+                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
+                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );
+
+                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
+                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );
+
+                        /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/
+                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
+                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
+                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
+                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );
+
+                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
+                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
+                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
+                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );
+
+                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
+                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );
+
+                        X1_3210  = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
+                        PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );
+
+                        X1_3210  = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
+                        PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );
+
+                        _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
+                        _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
+                    }
+
+                    for( ; k <= n; k++ ) {
+                        CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
+                            silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) );                    /* Q( -rshift ) */
+                        CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
+                            silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
+                    }
+                }
+            }
+        }
+
+        /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */
+        tmp1 = C_first_row[ n ];                                                                        /* Q( -rshifts ) */
+        tmp2 = C_last_row[ n ];                                                                         /* Q( -rshifts ) */
+        num  = 0;                                                                                       /* Q( -rshifts ) */
+        nrg  = silk_ADD32( CAb[ 0 ], CAf[ 0 ] );                                                        /* Q( 1-rshifts ) */
+        for( k = 0; k < n; k++ ) {
+            Atmp_QA = Af_QA[ k ];
+            lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
+            lz = silk_min( 32 - QA, lz );
+            Atmp1 = silk_LSHIFT32( Atmp_QA, lz );                                                       /* Q( QA + lz ) */
+
+            tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[  n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
+            tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
+            num  = silk_ADD_LSHIFT32( num,  silk_SMMUL( CAb[ n - k ],             Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
+            nrg  = silk_ADD_LSHIFT32( nrg,  silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
+                                                                                Atmp1 ), 32 - QA - lz );    /* Q( 1-rshifts ) */
+        }
+        CAf[ n + 1 ] = tmp1;                                                                            /* Q( -rshifts ) */
+        CAb[ n + 1 ] = tmp2;                                                                            /* Q( -rshifts ) */
+        num = silk_ADD32( num, tmp2 );                                                                  /* Q( -rshifts ) */
+        num = silk_LSHIFT32( -num, 1 );                                                                 /* Q( 1-rshifts ) */
+
+        /* Calculate the next order reflection (parcor) coefficient */
+        if( silk_abs( num ) < nrg ) {
+            rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
+        } else {
+            rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
+        }
+
+        /* Update inverse prediction gain */
+        tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
+        tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
+        if( tmp1 <= minInvGain_Q30 ) {
+            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
+            tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 );            /* Q30 */
+            rc_Q31 = silk_SQRT_APPROX( tmp2 );                                                  /* Q15 */
+            if( rc_Q31 > 0 ) {
+                 /* Newton-Raphson iteration */
+                rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 );                   /* Q15 */
+                rc_Q31 = silk_LSHIFT32( rc_Q31, 16 );                                               /* Q31 */
+                if( num < 0 ) {
+                    /* Ensure adjusted reflection coefficients has the original sign */
+                    rc_Q31 = -rc_Q31;
+                }
+            }
+            invGain_Q30 = minInvGain_Q30;
+            reached_max_gain = 1;
+        } else {
+            invGain_Q30 = tmp1;
+        }
+
+        /* Update the AR coefficients */
+        for( k = 0; k < (n + 1) >> 1; k++ ) {
+            tmp1 = Af_QA[ k ];                                                                  /* QA */
+            tmp2 = Af_QA[ n - k - 1 ];                                                          /* QA */
+            Af_QA[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );      /* QA */
+            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );      /* QA */
+        }
+        Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA );                                          /* QA */
+
+        if( reached_max_gain ) {
+            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
+            for( k = n + 1; k < D; k++ ) {
+                Af_QA[ k ] = 0;
+            }
+            break;
+        }
+
+        /* Update C * Af and C * Ab */
+        for( k = 0; k <= n + 1; k++ ) {
+            tmp1 = CAf[ k ];                                                                    /* Q( -rshifts ) */
+            tmp2 = CAb[ n - k + 1 ];                                                            /* Q( -rshifts ) */
+            CAf[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );        /* Q( -rshifts ) */
+            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );        /* Q( -rshifts ) */
+        }
+    }
+
+    if( reached_max_gain ) {
+        for( k = 0; k < D; k++ ) {
+            /* Scale coefficients */
+            A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
+        }
+        /* Subtract energy of preceding samples from C0 */
+        if( rshifts > 0 ) {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+            }
+        } else {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
+            }
+        }
+        /* Approximate residual energy */
+        *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
+        *res_nrg_Q = -rshifts;
+    } else {
+        /* Return residual energy */
+        nrg  = CAf[ 0 ];                                                                            /* Q( -rshifts ) */
+        tmp1 = (opus_int32)1 << 16;                                                                             /* Q16 */
+        for( k = 0; k < D; k++ ) {
+            Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );                                       /* Q16 */
+            nrg  = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 );                                         /* Q( -rshifts ) */
+            tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 );                                               /* Q16 */
+            A_Q16[ k ] = -Atmp1;
+        }
+        *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
+        *res_nrg_Q = -rshifts;
+    }
+}
--- a/media/libopus/silk/fixed/x86/vector_ops_FIX_sse4_1.c
+++ b/media/libopus/silk/fixed/x86/vector_ops_FIX_sse4_1.c
@ -0,0 +1,88 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+
+#include "SigProc_FIX.h"
+#include "pitch.h"
+
+opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+    const opus_int16            *inVec1,            /*    I input vector 1                                              */
+    const opus_int16            *inVec2,            /*    I input vector 2                                              */
+    const opus_int              len                 /*    I vector lengths                                              */
+)
+{
+    opus_int  i, dataSize8;
+    opus_int64 sum;
+
+    __m128i xmm_tempa;
+    __m128i inVec1_76543210, acc1;
+    __m128i inVec2_76543210, acc2;
+
+    sum = 0;
+    dataSize8 = len & ~7;
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+
+    for( i = 0; i < dataSize8; i += 8 ) {
+        inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
+        inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
+
+        /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
+        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
+
+        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );
+        /* equal shift right 8 bytes */
+        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
+
+        acc1 = _mm_add_epi64( acc1, xmm_tempa );
+        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
+    }
+
+    acc1 = _mm_add_epi64( acc1, acc2 );
+
+    /* equal shift right 8 bytes */
+    acc2 = _mm_shuffle_epi32( acc1, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+    acc1 = _mm_add_epi64( acc1, acc2 );
+
+    _mm_storel_epi64( (__m128i *)&sum, acc1 );
+
+    for( ; i < len; i++ ) {
+        sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
+    }
+
+    return sum;
+}
--- a/media/libopus/silk/float/LPC_analysis_filter_FLP.c
+++ b/media/libopus/silk/float/LPC_analysis_filter_FLP.c
@ -215,7 +215,7 @@ void silk_LPC_analysis_filter_FLP(
    const opus_int                  Order                               /* I    LPC order                                   */
 )
 {
-    silk_assert( Order <= length );
+    celt_assert( Order <= length );

    switch( Order ) {
        case 6:
@ -239,7 +239,7 @@ void silk_LPC_analysis_filter_FLP(
        break;

        default:
-            silk_assert( 0 );
+            celt_assert( 0 );
        break;
    }

--- a/media/libopus/silk/float/apply_sine_window_FLP.c
+++ b/media/libopus/silk/float/apply_sine_window_FLP.c
@ -45,10 +45,10 @@ void silk_apply_sine_window_FLP(
    opus_int   k;
    silk_float freq, c, S0, S1;

-    silk_assert( win_type == 1 || win_type == 2 );
+    celt_assert( win_type == 1 || win_type == 2 );

    /* Length must be multiple of 4 */
-    silk_assert( ( length & 3 ) == 0 );
+    celt_assert( ( length & 3 ) == 0 );

    freq = PI / ( length + 1 );

--- a/media/libopus/silk/float/burg_modified_FLP.c
+++ b/media/libopus/silk/float/burg_modified_FLP.c
@ -52,7 +52,7 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
    double           CAf[ SILK_MAX_ORDER_LPC + 1 ], CAb[ SILK_MAX_ORDER_LPC + 1 ];
    double           Af[ SILK_MAX_ORDER_LPC ];

-    silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
+    celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );

    /* Compute autocorrelations, added over subframes */
    C0 = silk_energy_FLP( x, nb_subfr * subfr_length );
--- a/media/libopus/silk/float/encode_frame_FLP.c
+++ b/media/libopus/silk/float/encode_frame_FLP.c
@ -42,21 +42,28 @@ static OPUS_INLINE void silk_LBRR_encode_FLP(
 );

 void silk_encode_do_VAD_FLP(
-    silk_encoder_state_FLP          *psEnc                              /* I/O  Encoder state FLP                           */
+    silk_encoder_state_FLP          *psEnc,                             /* I/O  Encoder state FLP                           */
+    opus_int                        activity                            /* I    Decision of Opus voice activity detector    */
 )
 {
+    const opus_int activity_threshold = SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 );
+
    /****************************/
    /* Voice Activity Detection */
    /****************************/
    silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch );
+    /* If Opus VAD is inactive and Silk VAD is active: lower Silk VAD to just under the threshold */
+    if( activity == VAD_NO_ACTIVITY && psEnc->sCmn.speech_activity_Q8 >= activity_threshold ) {
+        psEnc->sCmn.speech_activity_Q8 = activity_threshold - 1;
+    }

    /**************************************************/
    /* Convert speech activity into VAD and DTX flags */
    /**************************************************/
-    if( psEnc->sCmn.speech_activity_Q8 < SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ) ) {
+    if( psEnc->sCmn.speech_activity_Q8 < activity_threshold ) {
        psEnc->sCmn.indices.signalType = TYPE_NO_VOICE_ACTIVITY;
        psEnc->sCmn.noSpeechCounter++;
-        if( psEnc->sCmn.noSpeechCounter < NB_SPEECH_FRAMES_BEFORE_DTX ) {
+        if( psEnc->sCmn.noSpeechCounter <= NB_SPEECH_FRAMES_BEFORE_DTX ) {
            psEnc->sCmn.inDTX = 0;
        } else if( psEnc->sCmn.noSpeechCounter > MAX_CONSECUTIVE_DTX + NB_SPEECH_FRAMES_BEFORE_DTX ) {
            psEnc->sCmn.noSpeechCounter = NB_SPEECH_FRAMES_BEFORE_DTX;
@ -241,7 +248,7 @@ opus_int silk_encode_frame_FLP(
                if( found_lower && ( gainsID == gainsID_lower || nBits > maxBits ) ) {
                    /* Restore output state from earlier iteration that did meet the bitrate budget */
                    silk_memcpy( psRangeEnc, &sRangeEnc_copy2, sizeof( ec_enc ) );
-                    silk_assert( sRangeEnc_copy2.offs <= 1275 );
+                    celt_assert( sRangeEnc_copy2.offs <= 1275 );
                    silk_memcpy( psRangeEnc->buf, ec_buf_copy, sRangeEnc_copy2.offs );
                    silk_memcpy( &psEnc->sCmn.sNSQ, &sNSQ_copy2, sizeof( silk_nsq_state ) );
                    psEnc->sShape.LastGainIndex = LastGainIndex_copy2;
@ -271,7 +278,7 @@ opus_int silk_encode_frame_FLP(
                    gainsID_lower = gainsID;
                    /* Copy part of the output state */
                    silk_memcpy( &sRangeEnc_copy2, psRangeEnc, sizeof( ec_enc ) );
-                    silk_assert( psRangeEnc->offs <= 1275 );
+                    celt_assert( psRangeEnc->offs <= 1275 );
                    silk_memcpy( ec_buf_copy, psRangeEnc->buf, psRangeEnc->offs );
                    silk_memcpy( &sNSQ_copy2, &psEnc->sCmn.sNSQ, sizeof( silk_nsq_state ) );
                    LastGainIndex_copy2 = psEnc->sShape.LastGainIndex;
--- a/media/libopus/silk/float/find_LPC_FLP.c
+++ b/media/libopus/silk/float/find_LPC_FLP.c
@ -99,6 +99,6 @@ void silk_find_LPC_FLP(
        silk_A2NLSF_FLP( NLSF_Q15, a, psEncC->predictLPCOrder );
    }

-    silk_assert( psEncC->indices.NLSFInterpCoef_Q2 == 4 ||
+    celt_assert( psEncC->indices.NLSFInterpCoef_Q2 == 4 ||
        ( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) );
 }
--- a/media/libopus/silk/float/find_pitch_lags_FLP.c
+++ b/media/libopus/silk/float/find_pitch_lags_FLP.c
@ -56,7 +56,7 @@ void silk_find_pitch_lags_FLP(
    buf_len = psEnc->sCmn.la_pitch + psEnc->sCmn.frame_length + psEnc->sCmn.ltp_mem_length;

    /* Safety check */
-    silk_assert( buf_len >= psEnc->sCmn.pitch_LPC_win_length );
+    celt_assert( buf_len >= psEnc->sCmn.pitch_LPC_win_length );

    x_buf = x - psEnc->sCmn.ltp_mem_length;

--- a/media/libopus/silk/float/find_pred_coefs_FLP.c
+++ b/media/libopus/silk/float/find_pred_coefs_FLP.c
@ -59,7 +59,7 @@ void silk_find_pred_coefs_FLP(
        /**********/
        /* VOICED */
        /**********/
-        silk_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );
+        celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );

        /* LTP analysis */
        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr );
--- a/media/libopus/silk/float/main_FLP.h
+++ b/media/libopus/silk/float/main_FLP.h
@ -56,7 +56,8 @@ void silk_HP_variable_cutoff(

 /* Encoder main function */
 void silk_encode_do_VAD_FLP(
-    silk_encoder_state_FLP          *psEnc                              /* I/O  Encoder state FLP                           */
+    silk_encoder_state_FLP          *psEnc,                             /* I/O  Encoder state FLP                           */
+    opus_int                        activity                            /* I    Decision of Opus voice activity detector    */
 );

 /* Encoder main function */
--- a/media/libopus/silk/float/pitch_analysis_core_FLP.c
+++ b/media/libopus/silk/float/pitch_analysis_core_FLP.c
@ -109,11 +109,11 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,
    const opus_int8 *Lag_CB_ptr;

    /* Check for valid sampling frequency */
-    silk_assert( Fs_kHz == 8 || Fs_kHz == 12 || Fs_kHz == 16 );
+    celt_assert( Fs_kHz == 8 || Fs_kHz == 12 || Fs_kHz == 16 );

    /* Check for valid complexity setting */
-    silk_assert( complexity >= SILK_PE_MIN_COMPLEX );
-    silk_assert( complexity <= SILK_PE_MAX_COMPLEX );
+    celt_assert( complexity >= SILK_PE_MIN_COMPLEX );
+    celt_assert( complexity <= SILK_PE_MAX_COMPLEX );

    silk_assert( search_thres1 >= 0.0f && search_thres1 <= 1.0f );
    silk_assert( search_thres2 >= 0.0f && search_thres2 <= 1.0f );
@ -148,7 +148,7 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,
        silk_resampler_down2_3( filt_state, frame_8_FIX, frame_12_FIX, frame_length );
        silk_short2float_array( frame_8kHz, frame_8_FIX, frame_length_8kHz );
    } else {
-        silk_assert( Fs_kHz == 8 );
+        celt_assert( Fs_kHz == 8 );
        silk_float2short_array( frame_8_FIX, frame, frame_length_8kHz );
    }

@ -169,14 +169,14 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,
    target_ptr = &frame_4kHz[ silk_LSHIFT( sf_length_4kHz, 2 ) ];
    for( k = 0; k < nb_subfr >> 1; k++ ) {
        /* Check that we are within range of the array */
-        silk_assert( target_ptr >= frame_4kHz );
-        silk_assert( target_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );
+        celt_assert( target_ptr >= frame_4kHz );
+        celt_assert( target_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );

        basis_ptr = target_ptr - min_lag_4kHz;

        /* Check that we are within range of the array */
-        silk_assert( basis_ptr >= frame_4kHz );
-        silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );
+        celt_assert( basis_ptr >= frame_4kHz );
+        celt_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );

        celt_pitch_xcorr( target_ptr, target_ptr-max_lag_4kHz, xcorr, sf_length_8kHz, max_lag_4kHz - min_lag_4kHz + 1, arch );

@ -215,7 +215,7 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,

    /* Sort */
    length_d_srch = 4 + 2 * complexity;
-    silk_assert( 3 * length_d_srch <= PE_D_SRCH_LENGTH );
+    celt_assert( 3 * length_d_srch <= PE_D_SRCH_LENGTH );
    silk_insertion_sort_decreasing_FLP( &C[ 0 ][ min_lag_4kHz ], d_srch, max_lag_4kHz - min_lag_4kHz + 1, length_d_srch );

    /* Escape if correlation is very low already here */
@ -238,7 +238,7 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,
            break;
        }
    }
-    silk_assert( length_d_srch > 0 );
+    celt_assert( length_d_srch > 0 );

    for( i = min_lag_8kHz - 5; i < max_lag_8kHz + 5; i++ ) {
        d_comp[ i ] = 0;
@ -471,7 +471,7 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,
        *lagIndex = (opus_int16)( lag - min_lag_8kHz );
        *contourIndex = (opus_int8)CBimax;
    }
-    silk_assert( *lagIndex >= 0 );
+    celt_assert( *lagIndex >= 0 );
    /* return as voiced */
    return 0;
 }
@ -506,8 +506,8 @@ static void silk_P_Ana_calc_corr_st3(
    opus_val32 xcorr[ SCRATCH_SIZE ];
    const opus_int8 *Lag_range_ptr, *Lag_CB_ptr;

-    silk_assert( complexity >= SILK_PE_MIN_COMPLEX );
-    silk_assert( complexity <= SILK_PE_MAX_COMPLEX );
+    celt_assert( complexity >= SILK_PE_MIN_COMPLEX );
+    celt_assert( complexity <= SILK_PE_MAX_COMPLEX );

    if( nb_subfr == PE_MAX_NB_SUBFR ) {
        Lag_range_ptr = &silk_Lag_range_stage3[ complexity ][ 0 ][ 0 ];
@ -515,7 +515,7 @@ static void silk_P_Ana_calc_corr_st3(
        nb_cbk_search = silk_nb_cbk_searchs_stage3[ complexity ];
        cbk_size      = PE_NB_CBKS_STAGE3_MAX;
    } else {
-        silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1);
+        celt_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1);
        Lag_range_ptr = &silk_Lag_range_stage3_10_ms[ 0 ][ 0 ];
        Lag_CB_ptr    = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ];
        nb_cbk_search = PE_NB_CBKS_STAGE3_10MS;
@ -572,8 +572,8 @@ static void silk_P_Ana_calc_energy_st3(
    silk_float scratch_mem[ SCRATCH_SIZE ];
    const opus_int8 *Lag_range_ptr, *Lag_CB_ptr;

-    silk_assert( complexity >= SILK_PE_MIN_COMPLEX );
-    silk_assert( complexity <= SILK_PE_MAX_COMPLEX );
+    celt_assert( complexity >= SILK_PE_MIN_COMPLEX );
+    celt_assert( complexity <= SILK_PE_MAX_COMPLEX );

    if( nb_subfr == PE_MAX_NB_SUBFR ) {
        Lag_range_ptr = &silk_Lag_range_stage3[ complexity ][ 0 ][ 0 ];
@ -581,7 +581,7 @@ static void silk_P_Ana_calc_energy_st3(
        nb_cbk_search = silk_nb_cbk_searchs_stage3[ complexity ];
        cbk_size      = PE_NB_CBKS_STAGE3_MAX;
    } else {
-        silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1);
+        celt_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1);
        Lag_range_ptr = &silk_Lag_range_stage3_10_ms[ 0 ][ 0 ];
        Lag_CB_ptr    = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ];
        nb_cbk_search = PE_NB_CBKS_STAGE3_10MS;
--- a/media/libopus/silk/float/residual_energy_FLP.c
+++ b/media/libopus/silk/float/residual_energy_FLP.c
@ -47,7 +47,7 @@ silk_float silk_residual_energy_covar_FLP(                              /* O
    silk_float tmp, nrg = 0.0f, regularization;

    /* Safety checks */
-    silk_assert( D >= 0 );
+    celt_assert( D >= 0 );

    regularization = REGULARIZATION_FACTOR * ( wXX[ 0 ] + wXX[ D * D - 1 ] );
    for( k = 0; k < MAX_ITERATIONS_RESIDUAL_NRG; k++ ) {
--- a/media/libopus/silk/float/schur_FLP.c
+++ b/media/libopus/silk/float/schur_FLP.c
@ -41,7 +41,7 @@ silk_float silk_schur_FLP(                  /* O    returns residual energy
    double C[ SILK_MAX_ORDER_LPC + 1 ][ 2 ];
    double Ctmp1, Ctmp2, rc_tmp;

-    silk_assert( order >= 0 && order <= SILK_MAX_ORDER_LPC );
+    celt_assert( order >= 0 && order <= SILK_MAX_ORDER_LPC );

    /* Copy correlations */
    k = 0;
--- a/media/libopus/silk/float/sort_FLP.c
+++ b/media/libopus/silk/float/sort_FLP.c
@ -47,9 +47,9 @@ void silk_insertion_sort_decreasing_FLP(
    opus_int   i, j;

    /* Safety checks */
-    silk_assert( K >  0 );
-    silk_assert( L >  0 );
-    silk_assert( L >= K );
+    celt_assert( K >  0 );
+    celt_assert( L >  0 );
+    celt_assert( L >= K );

    /* Write start indices in index vector */
    for( i = 0; i < K; i++ ) {
--- a/media/libopus/silk/float/warped_autocorrelation_FLP.c
+++ b/media/libopus/silk/float/warped_autocorrelation_FLP.c
@ -46,7 +46,7 @@ void silk_warped_autocorrelation_FLP(
    double      C[     MAX_SHAPE_LPC_ORDER + 1 ] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };

    /* Order must be even */
-    silk_assert( ( order & 1 ) == 0 );
+    celt_assert( ( order & 1 ) == 0 );

    /* Loop over samples */
    for( n = 0; n < length; n++ ) {
--- a/media/libopus/silk/interpolate.c
+++ b/media/libopus/silk/interpolate.c
@ -42,8 +42,8 @@ void silk_interpolate(
 {
    opus_int i;

-    silk_assert( ifact_Q2 >= 0 );
-    silk_assert( ifact_Q2 <= 4 );
+    celt_assert( ifact_Q2 >= 0 );
+    celt_assert( ifact_Q2 <= 4 );

    for( i = 0; i < d; i++ ) {
        xi[ i ] = (opus_int16)silk_ADD_RSHIFT( x0[ i ], silk_SMULBB( x1[ i ] - x0[ i ], ifact_Q2 ), 2 );
--- a/media/libopus/silk/process_NLSFs.c
+++ b/media/libopus/silk/process_NLSFs.c
@ -48,7 +48,7 @@ void silk_process_NLSFs(

    silk_assert( psEncC->speech_activity_Q8 >=   0 );
    silk_assert( psEncC->speech_activity_Q8 <= SILK_FIX_CONST( 1.0, 8 ) );
-    silk_assert( psEncC->useInterpolatedNLSFs == 1 || psEncC->indices.NLSFInterpCoef_Q2 == ( 1 << 2 ) );
+    celt_assert( psEncC->useInterpolatedNLSFs == 1 || psEncC->indices.NLSFInterpCoef_Q2 == ( 1 << 2 ) );

    /***********************/
    /* Calculate mu values */
@ -60,7 +60,7 @@ void silk_process_NLSFs(
        NLSF_mu_Q20 = silk_ADD_RSHIFT( NLSF_mu_Q20, NLSF_mu_Q20, 1 );
    }

-    silk_assert( NLSF_mu_Q20 >  0 );
+    celt_assert( NLSF_mu_Q20 >  0 );
    silk_assert( NLSF_mu_Q20 <= SILK_FIX_CONST( 0.005, 20 ) );

    /* Calculate NLSF weights */
@ -101,7 +101,7 @@ void silk_process_NLSFs(

    } else {
        /* Copy LPC coefficients for first half from second half */
-        silk_assert( psEncC->predictLPCOrder <= MAX_LPC_ORDER );
+        celt_assert( psEncC->predictLPCOrder <= MAX_LPC_ORDER );
        silk_memcpy( PredCoef_Q12[ 0 ], PredCoef_Q12[ 1 ], psEncC->predictLPCOrder * sizeof( opus_int16 ) );
    }
 }
--- a/media/libopus/silk/resampler.c
+++ b/media/libopus/silk/resampler.c
@ -91,14 +91,14 @@ opus_int silk_resampler_init(
    if( forEnc ) {
        if( ( Fs_Hz_in  != 8000 && Fs_Hz_in  != 12000 && Fs_Hz_in  != 16000 && Fs_Hz_in  != 24000 && Fs_Hz_in  != 48000 ) ||
            ( Fs_Hz_out != 8000 && Fs_Hz_out != 12000 && Fs_Hz_out != 16000 ) ) {
-            silk_assert( 0 );
+            celt_assert( 0 );
            return -1;
        }
        S->inputDelay = delay_matrix_enc[ rateID( Fs_Hz_in ) ][ rateID( Fs_Hz_out ) ];
    } else {
        if( ( Fs_Hz_in  != 8000 && Fs_Hz_in  != 12000 && Fs_Hz_in  != 16000 ) ||
            ( Fs_Hz_out != 8000 && Fs_Hz_out != 12000 && Fs_Hz_out != 16000 && Fs_Hz_out != 24000 && Fs_Hz_out != 48000 ) ) {
-            silk_assert( 0 );
+            celt_assert( 0 );
            return -1;
        }
        S->inputDelay = delay_matrix_dec[ rateID( Fs_Hz_in ) ][ rateID( Fs_Hz_out ) ];
@ -151,7 +151,7 @@ opus_int silk_resampler_init(
            S->Coefs = silk_Resampler_1_6_COEFS;
        } else {
            /* None available */
-            silk_assert( 0 );
+            celt_assert( 0 );
            return -1;
        }
    } else {
@ -181,9 +181,9 @@ opus_int silk_resampler(
    opus_int nSamples;

    /* Need at least 1 ms of input data */
-    silk_assert( inLen >= S->Fs_in_kHz );
+    celt_assert( inLen >= S->Fs_in_kHz );
    /* Delay can't exceed the 1 ms of buffering */
-    silk_assert( S->inputDelay <= S->Fs_in_kHz );
+    celt_assert( S->inputDelay <= S->Fs_in_kHz );

    nSamples = S->Fs_in_kHz - S->inputDelay;

--- a/media/libopus/silk/resampler_down2.c
+++ b/media/libopus/silk/resampler_down2.c
@ -43,8 +43,8 @@ void silk_resampler_down2(
    opus_int32 k, len2 = silk_RSHIFT32( inLen, 1 );
    opus_int32 in32, out32, Y, X;

-    silk_assert( silk_resampler_down2_0 > 0 );
-    silk_assert( silk_resampler_down2_1 < 0 );
+    celt_assert( silk_resampler_down2_0 > 0 );
+    celt_assert( silk_resampler_down2_1 < 0 );

    /* Internal variables and state are in Q10 format */
    for( k = 0; k < len2; k++ ) {
--- a/media/libopus/silk/resampler_private_down_FIR.c
+++ b/media/libopus/silk/resampler_private_down_FIR.c
@ -136,7 +136,7 @@ static OPUS_INLINE opus_int16 *silk_resampler_private_down_FIR_INTERPOL(
            }
            break;
        default:
-            silk_assert( 0 );
+            celt_assert( 0 );
    }
    return out;
 }
--- a/media/libopus/silk/sort.c
+++ b/media/libopus/silk/sort.c
@ -48,9 +48,9 @@ void silk_insertion_sort_increasing(
    opus_int        i, j;

    /* Safety checks */
-    silk_assert( K >  0 );
-    silk_assert( L >  0 );
-    silk_assert( L >= K );
+    celt_assert( K >  0 );
+    celt_assert( L >  0 );
+    celt_assert( L >= K );

    /* Write start indices in index vector */
    for( i = 0; i < K; i++ ) {
@ -96,9 +96,9 @@ void silk_insertion_sort_decreasing_int16(
    opus_int value;

    /* Safety checks */
-    silk_assert( K >  0 );
-    silk_assert( L >  0 );
-    silk_assert( L >= K );
+    celt_assert( K >  0 );
+    celt_assert( L >  0 );
+    celt_assert( L >= K );

    /* Write start indices in index vector */
    for( i = 0; i < K; i++ ) {
@ -141,7 +141,7 @@ void silk_insertion_sort_increasing_all_values_int16(
    opus_int    i, j;

    /* Safety checks */
-    silk_assert( L >  0 );
+    celt_assert( L >  0 );

    /* Sort vector elements by value, increasing order */
    for( i = 1; i < L; i++ ) {
--- a/media/libopus/silk/stereo_LR_to_MS.c
+++ b/media/libopus/silk/stereo_LR_to_MS.c
@ -109,7 +109,7 @@ void silk_stereo_LR_to_MS(
    if( total_rate_bps < 1 ) {
        total_rate_bps = 1;
    }
-    min_mid_rate_bps = silk_SMLABB( 2000, fs_kHz, 900 );
+    min_mid_rate_bps = silk_SMLABB( 2000, fs_kHz, 600 );
    silk_assert( min_mid_rate_bps < 32767 );
    /* Default bitrate distribution: 8 parts for Mid and (5+3*frac) parts for Side. so: mid_rate = ( 8 / ( 13 + 3 * frac ) ) * total_ rate */
    frac_3_Q16 = silk_MUL( 3, frac_Q16 );
--- a/media/libopus/silk/stereo_encode_pred.c
+++ b/media/libopus/silk/stereo_encode_pred.c
@ -41,11 +41,11 @@ void silk_stereo_encode_pred(

    /* Entropy coding */
    n = 5 * ix[ 0 ][ 2 ] + ix[ 1 ][ 2 ];
-    silk_assert( n < 25 );
+    celt_assert( n < 25 );
    ec_enc_icdf( psRangeEnc, n, silk_stereo_pred_joint_iCDF, 8 );
    for( n = 0; n < 2; n++ ) {
-        silk_assert( ix[ n ][ 0 ] < 3 );
-        silk_assert( ix[ n ][ 1 ] < STEREO_QUANT_SUB_STEPS );
+        celt_assert( ix[ n ][ 0 ] < 3 );
+        celt_assert( ix[ n ][ 1 ] < STEREO_QUANT_SUB_STEPS );
        ec_enc_icdf( psRangeEnc, ix[ n ][ 0 ], silk_uniform3_iCDF, 8 );
        ec_enc_icdf( psRangeEnc, ix[ n ][ 1 ], silk_uniform5_iCDF, 8 );
    }
--- a/media/libopus/silk/structs.h
+++ b/media/libopus/silk/structs.h
@ -78,6 +78,7 @@ typedef struct {
    opus_int32                   In_LP_State[ 2 ];           /* Low pass filter state */
    opus_int32                   transition_frame_no;        /* Counter which is mapped to a cut-off frequency */
    opus_int                     mode;                       /* Operating mode, <0: switch down, >0: switch up; 0: do nothing           */
+    opus_int32                   saved_fs_kHz;               /* If non-zero, holds the last sampling rate before a bandwidth switching reset. */
 } silk_LP_state;

 /* Structure containing NLSF codebook */
--- a/media/libopus/silk/tables.h
+++ b/media/libopus/silk/tables.h
@ -97,12 +97,6 @@ extern const opus_uint8  silk_NLSF_interpolation_factor_iCDF[ 5 ];
 extern const silk_NLSF_CB_struct silk_NLSF_CB_WB;                                                   /* 1040 */
 extern const silk_NLSF_CB_struct silk_NLSF_CB_NB_MB;                                                /* 728 */

-/* Piece-wise linear mapping from bitrate in kbps to coding quality in dB SNR */
-extern const opus_int32  silk_TargetRate_table_NB[  TARGET_RATE_TAB_SZ ];                           /*  32 */
-extern const opus_int32  silk_TargetRate_table_MB[  TARGET_RATE_TAB_SZ ];                           /*  32 */
-extern const opus_int32  silk_TargetRate_table_WB[  TARGET_RATE_TAB_SZ ];                           /*  32 */
-extern const opus_int16  silk_SNR_table_Q1[         TARGET_RATE_TAB_SZ ];                           /*  32 */
-
 /* Quantization offsets */
 extern const opus_int16  silk_Quantization_Offsets_Q10[ 2 ][ 2 ];                                   /*   8 */

--- a/media/libopus/silk/tables_other.c
+++ b/media/libopus/silk/tables_other.c
@ -38,20 +38,6 @@ extern "C"
 {
 #endif

-/* Piece-wise linear mapping from bitrate in kbps to coding quality in dB SNR */
-const opus_int32 silk_TargetRate_table_NB[ TARGET_RATE_TAB_SZ ] = {
-    0,      8000,   9400,   11500,  13500,  17500,  25000,  MAX_TARGET_RATE_BPS
-};
-const opus_int32 silk_TargetRate_table_MB[ TARGET_RATE_TAB_SZ ] = {
-    0,      9000,   12000,  14500,  18500,  24500,  35500,  MAX_TARGET_RATE_BPS
-};
-const opus_int32 silk_TargetRate_table_WB[ TARGET_RATE_TAB_SZ ] = {
-    0,      10500,  14000,  17000,  21500,  28500,  42000,  MAX_TARGET_RATE_BPS
-};
-const opus_int16 silk_SNR_table_Q1[ TARGET_RATE_TAB_SZ ] = {
-    18,     29,     38,     40,     46,     52,     62,     84
-};
-
 /* Tables for stereo predictor coding */
 const opus_int16 silk_stereo_pred_quant_Q13[ STEREO_QUANT_TAB_SIZE ] = {
    -13732, -10050, -8266, -7526, -6500, -5000, -2950,  -820,
--- a/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c
+++ b/media/libopus/silk/x86/NSQ_del_dec_sse4_1.c
@ -0,0 +1,859 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+
+#include "stack_alloc.h"
+
+typedef struct {
+    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ];
+    opus_int32 RandState[ DECISION_DELAY ];
+    opus_int32 Q_Q10[     DECISION_DELAY ];
+    opus_int32 Xq_Q14[    DECISION_DELAY ];
+    opus_int32 Pred_Q15[  DECISION_DELAY ];
+    opus_int32 Shape_Q14[ DECISION_DELAY ];
+    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
+    opus_int32 LF_AR_Q14;
+    opus_int32 Seed;
+    opus_int32 SeedInit;
+    opus_int32 RD_Q10;
+} NSQ_del_dec_struct;
+
+typedef struct {
+    opus_int32 Q_Q10;
+    opus_int32 RD_Q10;
+    opus_int32 xq_Q14;
+    opus_int32 LF_AR_Q14;
+    opus_int32 sLTP_shp_Q14;
+    opus_int32 LPC_exc_Q14;
+} NSQ_sample_struct;
+
+typedef NSQ_sample_struct  NSQ_sample_pair[ 2 ];
+
+static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
+    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
+    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
+    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
+    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
+    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
+    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
+    opus_int            subfr,                      /* I    Subframe number                     */
+    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
+    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
+    const opus_int      signal_type,                /* I    Signal type                         */
+    const opus_int      decisionDelay               /* I    Decision delay                      */
+);
+
+/******************************************/
+/* Noise shape quantizer for one subframe */
+/******************************************/
+static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
+    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
+    opus_int            signalType,             /* I    Signal type                         */
+    const opus_int32    x_Q10[],                /* I                                        */
+    opus_int8           pulses[],               /* O                                        */
+    opus_int16          xq[],                   /* O                                        */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
+    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
+    opus_int            lag,                    /* I    Pitch lag                           */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
+    opus_int32          LF_shp_Q14,             /* I                                        */
+    opus_int32          Gain_Q16,               /* I                                        */
+    opus_int            Lambda_Q10,             /* I                                        */
+    opus_int            offset_Q10,             /* I                                        */
+    opus_int            length,                 /* I    Input length                        */
+    opus_int            subfr,                  /* I    Subframe number                     */
+    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
+    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
+    opus_int            warping_Q16,            /* I                                        */
+    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
+    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
+    opus_int            decisionDelay           /* I                                        */
+);
+
+void silk_NSQ_del_dec_sse4_1(
+    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+)
+{
+    opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
+    opus_int            last_smple_idx, smpl_buf_idx, decisionDelay;
+    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
+    opus_int16          *pxq;
+    VARDECL( opus_int32, sLTP_Q15 );
+    VARDECL( opus_int16, sLTP );
+    opus_int32          HarmShapeFIRPacked_Q14;
+    opus_int            offset_Q10;
+    opus_int32          RDmin_Q10, Gain_Q10;
+    VARDECL( opus_int32, x_sc_Q10 );
+    VARDECL( opus_int32, delayedGain_Q10 );
+    VARDECL( NSQ_del_dec_struct, psDelDec );
+    NSQ_del_dec_struct  *psDD;
+    SAVE_STACK;
+
+    /* Set unvoiced lag to the previous one, overwrite later for voiced */
+    lag = NSQ->lagPrev;
+
+    silk_assert( NSQ->prev_gain_Q16 != 0 );
+
+    /* Initialize delayed decision states */
+    ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );
+    silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );
+    for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
+        psDD                 = &psDelDec[ k ];
+        psDD->Seed           = ( k + psIndices->Seed ) & 3;
+        psDD->SeedInit       = psDD->Seed;
+        psDD->RD_Q10         = 0;
+        psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;
+        psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
+        silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
+        silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
+    }
+
+    offset_Q10   = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
+    smpl_buf_idx = 0; /* index of oldest samples */
+
+    decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
+
+    /* For voiced frames limit the decision delay to lower than the pitch lag */
+    if( psIndices->signalType == TYPE_VOICED ) {
+        for( k = 0; k < psEncC->nb_subfr; k++ ) {
+            decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );
+        }
+    } else {
+        if( lag > 0 ) {
+            decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
+        }
+    }
+
+    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
+        LSF_interpolation_flag = 0;
+    } else {
+        LSF_interpolation_flag = 1;
+    }
+
+    ALLOC( sLTP_Q15,
+           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
+    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
+    ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
+    /* Set up pointers to start of sub frame */
+    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
+    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
+    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
+    subfr = 0;
+    for( k = 0; k < psEncC->nb_subfr; k++ ) {
+        A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
+        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
+        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+
+        /* Noise shape parameters */
+        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
+        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
+        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
+
+        NSQ->rewhite_flag = 0;
+        if( psIndices->signalType == TYPE_VOICED ) {
+            /* Voiced */
+            lag = pitchL[ k ];
+
+            /* Re-whitening */
+            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
+                if( k == 2 ) {
+                    /* RESET DELAYED DECISIONS */
+                    /* Find winner */
+                    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
+                    Winner_ind = 0;
+                    for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
+                        if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {
+                            RDmin_Q10 = psDelDec[ i ].RD_Q10;
+                            Winner_ind = i;
+                        }
+                    }
+                    for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {
+                        if( i != Winner_ind ) {
+                            psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 );
+                            silk_assert( psDelDec[ i ].RD_Q10 >= 0 );
+                        }
+                    }
+
+                    /* Copy final part of signals from winner state to output and long-term filter states */
+                    psDD = &psDelDec[ Winner_ind ];
+                    last_smple_idx = smpl_buf_idx + decisionDelay;
+                    for( i = 0; i < decisionDelay; i++ ) {
+                        last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
+                        if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
+                        pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
+                        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
+                            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) );
+                        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
+                    }
+
+                    subfr = 0;
+                }
+
+                /* Rewhiten with new A coefs */
+                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
+                celt_assert( start_idx > 0 );
+
+                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
+                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
+
+                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
+                NSQ->rewhite_flag = 1;
+            }
+        }
+
+        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
+            psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
+
+        silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
+            delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
+            Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
+            psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
+
+        x_Q3   += psEncC->subfr_length;
+        pulses += psEncC->subfr_length;
+        pxq    += psEncC->subfr_length;
+    }
+
+    /* Find winner */
+    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
+    Winner_ind = 0;
+    for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
+        if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {
+            RDmin_Q10 = psDelDec[ k ].RD_Q10;
+            Winner_ind = k;
+        }
+    }
+
+    /* Copy final part of signals from winner state to output and long-term filter states */
+    psDD = &psDelDec[ Winner_ind ];
+    psIndices->Seed = psDD->SeedInit;
+    last_smple_idx = smpl_buf_idx + decisionDelay;
+    Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
+    for( i = 0; i < decisionDelay; i++ ) {
+        last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
+        if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
+        pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
+        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
+            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
+        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
+    }
+    silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
+    silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );
+
+    /* Update states */
+    NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
+    NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];
+
+    /* Save quantized speech signal */
+    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
+    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+    RESTORE_STACK;
+}
+
+/******************************************/
+/* Noise shape quantizer for one subframe */
+/******************************************/
+static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
+    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
+    opus_int            signalType,             /* I    Signal type                         */
+    const opus_int32    x_Q10[],                /* I                                        */
+    opus_int8           pulses[],               /* O                                        */
+    opus_int16          xq[],                   /* O                                        */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
+    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
+    opus_int            lag,                    /* I    Pitch lag                           */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
+    opus_int32          LF_shp_Q14,             /* I                                        */
+    opus_int32          Gain_Q16,               /* I                                        */
+    opus_int            Lambda_Q10,             /* I                                        */
+    opus_int            offset_Q10,             /* I                                        */
+    opus_int            length,                 /* I    Input length                        */
+    opus_int            subfr,                  /* I    Subframe number                     */
+    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
+    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
+    opus_int            warping_Q16,            /* I                                        */
+    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
+    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
+    opus_int            decisionDelay           /* I                                        */
+)
+{
+    opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
+    opus_int32   Winner_rand_state;
+    opus_int32   LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
+    opus_int32   n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
+    opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
+    opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+    VARDECL( NSQ_sample_pair, psSampleState );
+    NSQ_del_dec_struct *psDD;
+    NSQ_sample_struct  *psSS;
+
+    __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;
+    __m128i b_Q12_0123, b_sr_Q12_0123;
+    SAVE_STACK;
+
+    celt_assert( nStatesDelayedDecision > 0 );
+    ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
+
+    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
+    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
+    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
+
+    a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );
+    a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );
+
+    if( opus_likely( predictLPCOrder == 16 ) ) {
+        a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );
+        a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );
+    }
+
+    if( signalType == TYPE_VOICED ){
+        b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );
+        b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+    }
+    for( i = 0; i < length; i++ ) {
+        /* Perform common calculations used in all states */
+
+        /* Long-term prediction */
+        if( signalType == TYPE_VOICED ) {
+            /* Unrolled loop */
+            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+            LTP_pred_Q14 = 2;
+            {
+                __m128i tmpa, tmpb, pred_lag_ptr_tmp;
+                pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
+                pred_lag_ptr_tmp    = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
+                tmpa                = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
+                tmpa                = _mm_srli_si128( tmpa, 2 );
+
+                pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */
+                pred_lag_ptr_tmp    = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );
+                pred_lag_ptr_tmp    = _mm_srli_si128( pred_lag_ptr_tmp, 2 );
+                pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );
+
+                tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */
+                pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );
+                LTP_pred_Q14        += _mm_cvtsi128_si32( pred_lag_ptr_tmp );
+
+                LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
+                LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
+                pred_lag_ptr++;
+            }
+        } else {
+            LTP_pred_Q14 = 0;
+        }
+
+        /* Long-term shaping */
+        if( lag > 0 ) {
+            /* Symmetric, packed FIR coefficients */
+            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
+            shp_lag_ptr++;
+        } else {
+            n_LTP_Q14 = 0;
+        }
+        {
+            __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;
+
+            for( k = 0; k < nStatesDelayedDecision; k++ ) {
+                /* Delayed decision state */
+                psDD = &psDelDec[ k ];
+
+                /* Sample state */
+                psSS = psSampleState[ k ];
+
+                /* Generate dither */
+                psDD->Seed = silk_RAND( psDD->Seed );
+
+                /* Pointer used in short term prediction and shaping */
+                psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
+                /* Short-term prediction */
+                silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
+                /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+                LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
+
+                tmpb = _mm_setzero_si128();
+
+                /* step 1 */
+                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
+                psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );      /* 0, -1, -2, -3 */
+                tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 );    /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
+
+                tmpa            = _mm_srli_epi64( tmpa, 16 );
+                tmpb            = _mm_add_epi32( tmpb, tmpa );
+
+                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */
+                psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
+                tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
+
+                /* step 2 */
+                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );
+                psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
+                tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
+                tmpa            = _mm_srli_epi64( tmpa, 16 );
+                tmpb            = _mm_add_epi32( tmpb, tmpa );
+
+                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
+                psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
+                tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
+
+                if ( opus_likely( predictLPCOrder == 16 ) )
+                {
+                    /* step 3 */
+                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );
+                    psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
+                    tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
+                    tmpa            = _mm_srli_epi64( tmpa, 16 );
+                    tmpb            = _mm_add_epi32( tmpb, tmpa );
+
+                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                    a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */
+                    psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
+                    psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
+                    tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
+
+                    /* setp 4 */
+                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
+                    psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
+                    tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
+                    tmpa            = _mm_srli_epi64( tmpa, 16 );
+                    tmpb            = _mm_add_epi32( tmpb, tmpa );
+
+                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                    a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                    psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
+                    psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
+                    tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
+
+                    /* add at last */
+                    /* equal shift right 8 bytes*/
+                    tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+                    tmpb            = _mm_add_epi32( tmpb, tmpa );
+                    LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
+                }
+                else
+                {
+                    /* add at last */
+                    tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/
+                    tmpb            = _mm_add_epi32( tmpb, tmpa );
+                    LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
+
+                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
+                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
+                }
+
+                LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
+
+                /* Noise shape feedback */
+                silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
+                /* Output of lowpass section */
+                tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
+                /* Output of allpass section */
+                tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
+                psDD->sAR2_Q14[ 0 ] = tmp2;
+                n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
+                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
+                /* Loop over allpass sections */
+                for( j = 2; j < shapingLPCOrder; j += 2 ) {
+                    /* Output of allpass section */
+                    tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
+                    psDD->sAR2_Q14[ j - 1 ] = tmp1;
+                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
+                    /* Output of allpass section */
+                    tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
+                    psDD->sAR2_Q14[ j + 0 ] = tmp2;
+                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
+                }
+                psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
+                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
+
+                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 );                                      /* Q11 -> Q12 */
+                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 );              /* Q12 */
+                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 );                                      /* Q12 -> Q14 */
+
+                n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 );     /* Q12 */
+                n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 );            /* Q12 */
+                n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 );                                      /* Q12 -> Q14 */
+
+                /* Input minus prediction plus noise feedback                       */
+                /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
+                tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */
+                tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */
+                tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */
+                tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
+
+                r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
+
+                /* Flip sign depending on dither */
+                if ( psDD->Seed < 0 ) {
+                    r_Q10 = -r_Q10;
+                }
+                r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
+
+                /* Find two quantization level candidates and measure their rate-distortion */
+                q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
+                q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+                if( q1_Q0 > 0 ) {
+                    q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
+                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
+                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
+                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
+                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
+                } else if( q1_Q0 == 0 ) {
+                    q1_Q10  = offset_Q10;
+                    q2_Q10  = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
+                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
+                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
+                } else if( q1_Q0 == -1 ) {
+                    q2_Q10  = offset_Q10;
+                    q1_Q10  = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
+                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
+                    rd2_Q10 = silk_SMULBB(  q2_Q10, Lambda_Q10 );
+                } else {            /* q1_Q0 < -1 */
+                    q1_Q10  = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
+                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
+                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
+                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
+                    rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
+                }
+                rr_Q10  = silk_SUB32( r_Q10, q1_Q10 );
+                rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
+                rr_Q10  = silk_SUB32( r_Q10, q2_Q10 );
+                rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );
+
+                if( rd1_Q10 < rd2_Q10 ) {
+                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
+                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
+                    psSS[ 0 ].Q_Q10  = q1_Q10;
+                    psSS[ 1 ].Q_Q10  = q2_Q10;
+                } else {
+                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
+                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
+                    psSS[ 0 ].Q_Q10  = q2_Q10;
+                    psSS[ 1 ].Q_Q10  = q1_Q10;
+                }
+
+                /* Update states for best quantization */
+
+                /* Quantized excitation */
+                exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
+                if ( psDD->Seed < 0 ) {
+                    exc_Q14 = -exc_Q14;
+                }
+
+                /* Add predictions */
+                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
+                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+
+                /* Update states */
+                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
+                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
+                psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
+                psSS[ 0 ].xq_Q14       = xq_Q14;
+
+                /* Update states for second best quantization */
+
+                /* Quantized excitation */
+                exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
+                if ( psDD->Seed < 0 ) {
+                    exc_Q14 = -exc_Q14;
+                }
+
+
+                /* Add predictions */
+                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
+                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+
+                /* Update states */
+                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
+                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
+                psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
+                psSS[ 1 ].xq_Q14       = xq_Q14;
+            }
+        }
+        *smpl_buf_idx  = ( *smpl_buf_idx - 1 ) % DECISION_DELAY;
+        if( *smpl_buf_idx < 0 ) *smpl_buf_idx += DECISION_DELAY;
+        last_smple_idx = ( *smpl_buf_idx + decisionDelay ) % DECISION_DELAY;
+
+        /* Find winner */
+        RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
+        Winner_ind = 0;
+        for( k = 1; k < nStatesDelayedDecision; k++ ) {
+            if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
+                RDmin_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
+                Winner_ind = k;
+            }
+        }
+
+        /* Increase RD values of expired states */
+        Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
+        for( k = 0; k < nStatesDelayedDecision; k++ ) {
+            if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
+                psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
+                psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
+                silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
+            }
+        }
+
+        /* Find worst in first set and best in second set */
+        RDmax_Q10  = psSampleState[ 0 ][ 0 ].RD_Q10;
+        RDmin_Q10  = psSampleState[ 0 ][ 1 ].RD_Q10;
+        RDmax_ind = 0;
+        RDmin_ind = 0;
+        for( k = 1; k < nStatesDelayedDecision; k++ ) {
+            /* find worst in first set */
+            if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
+                RDmax_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
+                RDmax_ind = k;
+            }
+            /* find best in second set */
+            if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
+                RDmin_Q10  = psSampleState[ k ][ 1 ].RD_Q10;
+                RDmin_ind = k;
+            }
+        }
+
+        /* Replace a state if best from second set outperforms worst in first set */
+        if( RDmin_Q10 < RDmax_Q10 ) {
+            silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
+                         ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
+            silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
+        }
+
+        /* Write samples from winner to output and long-term filter states */
+        psDD = &psDelDec[ Winner_ind ];
+        if( subfr > 0 || i >= decisionDelay ) {
+            pulses[  i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
+            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
+                silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
+            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
+            sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDD->Pred_Q15[  last_smple_idx ];
+        }
+        NSQ->sLTP_shp_buf_idx++;
+        NSQ->sLTP_buf_idx++;
+
+        /* Update states */
+        for( k = 0; k < nStatesDelayedDecision; k++ ) {
+            psDD                                     = &psDelDec[ k ];
+            psSS                                     = &psSampleState[ k ][ 0 ];
+            psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
+            psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
+            psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
+            psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
+            psDD->Pred_Q15[  *smpl_buf_idx ]         = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
+            psDD->Shape_Q14[ *smpl_buf_idx ]         = psSS->sLTP_shp_Q14;
+            psDD->Seed                               = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
+            psDD->RandState[ *smpl_buf_idx ]         = psDD->Seed;
+            psDD->RD_Q10                             = psSS->RD_Q10;
+        }
+        delayedGain_Q10[     *smpl_buf_idx ]         = Gain_Q10;
+    }
+    /* Update LPC states */
+    for( k = 0; k < nStatesDelayedDecision; k++ ) {
+        psDD = &psDelDec[ k ];
+        silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
+    }
+    RESTORE_STACK;
+}
+
+static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
+    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
+    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
+    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
+    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
+    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
+    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
+    opus_int            subfr,                      /* I    Subframe number                     */
+    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
+    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
+    const opus_int      signal_type,                /* I    Signal type                         */
+    const opus_int      decisionDelay               /* I    Decision delay                      */
+)
+{
+    opus_int            i, k, lag;
+    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
+    NSQ_del_dec_struct  *psDD;
+    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+
+    lag          = pitchL[ subfr ];
+    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
+
+    silk_assert( inv_gain_Q31 != 0 );
+
+    /* Calculate gain adjustment factor */
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+    } else {
+        gain_adj_Q16 = (opus_int32)1 << 16;
+    }
+
+    /* Scale input */
+    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+
+    /* prepare inv_gain_Q23 in packed 4 32-bits */
+    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+
+    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
+        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+        /* equal shift right 4 bytes*/
+        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
+        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+
+        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
+        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+
+        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+
+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
+    }
+
+    for( ; i < psEncC->subfr_length; i++ ) {
+        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+    }
+
+    /* Save inverse gain */
+    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
+
+    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
+    if( NSQ->rewhite_flag ) {
+        if( subfr == 0 ) {
+            /* Do LTP downscaling */
+            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
+        }
+        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
+            silk_assert( i < MAX_FRAME_LENGTH );
+            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
+        }
+    }
+
+    /* Adjust for changing gain */
+    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+        /* Scale long-term shaping state */
+        {
+            __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
+
+            /* prepare gain_adj_Q16 in packed 4 32-bits */
+            xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 );
+
+            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
+            {
+                xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
+                /* equal shift right 4 bytes*/
+                xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+                xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
+                xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
+
+                xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
+                xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
+
+                xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
+
+                _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
+            }
+
+            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
+                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
+            }
+
+            /* Scale long-term prediction state */
+            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
+                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
+                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
+                }
+            }
+
+            for( k = 0; k < nStatesDelayedDecision; k++ ) {
+                psDD = &psDelDec[ k ];
+
+                /* Scale scalar states */
+                psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
+
+                /* Scale short-term prediction and shaping states */
+                for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
+                    psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] );
+                }
+                for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
+                    psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] );
+                }
+                for( i = 0; i < DECISION_DELAY; i++ ) {
+                    psDD->Pred_Q15[  i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[  i ] );
+                    psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] );
+                }
+            }
+        }
+    }
+}
--- a/media/libopus/silk/x86/NSQ_sse4_1.c
+++ b/media/libopus/silk/x86/NSQ_sse4_1.c
@ -0,0 +1,719 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+#include "stack_alloc.h"
+
+static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
+    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
+    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
+    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
+    opus_int            subfr,                  /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,          /* I                                    */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
+    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
+    const opus_int      signal_type             /* I    Signal type                     */
+);
+
+static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
+    opus_int            signalType,             /* I    Signal type                     */
+    const opus_int32    x_sc_Q10[],             /* I                                    */
+    opus_int8           pulses[],               /* O                                    */
+    opus_int16          xq[],                   /* O                                    */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
+    opus_int            lag,                    /* I    Pitch lag                       */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
+    opus_int32          LF_shp_Q14,             /* I                                    */
+    opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            offset_Q10,             /* I                                    */
+    opus_int            length,                 /* I    Input length                    */
+    opus_int32          table[][4]              /* I                                    */
+);
+
+void silk_NSQ_sse4_1(
+    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+)
+{
+    opus_int            k, lag, start_idx, LSF_interpolation_flag;
+    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
+    opus_int16          *pxq;
+    VARDECL( opus_int32, sLTP_Q15 );
+    VARDECL( opus_int16, sLTP );
+    opus_int32          HarmShapeFIRPacked_Q14;
+    opus_int            offset_Q10;
+    VARDECL( opus_int32, x_sc_Q10 );
+
+    opus_int32   table[ 64 ][ 4 ];
+    opus_int32   tmp1;
+    opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
+
+    SAVE_STACK;
+
+    NSQ->rand_seed = psIndices->Seed;
+
+    /* Set unvoiced lag to the previous one, overwrite later for voiced */
+    lag = NSQ->lagPrev;
+
+    silk_assert( NSQ->prev_gain_Q16 != 0 );
+
+    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
+
+    /* 0 */
+    q1_Q10  = offset_Q10;
+    q2_Q10  = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
+    rd1_Q20 = q1_Q10 * Lambda_Q10;
+    rd2_Q20 = q2_Q10 * Lambda_Q10;
+
+    table[ 32 ][ 0 ] = q1_Q10;
+    table[ 32 ][ 1 ] = q2_Q10;
+    table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
+    table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
+
+    /* -1 */
+    q1_Q10  = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
+    q2_Q10  = offset_Q10;
+    rd1_Q20 = - q1_Q10 * Lambda_Q10;
+    rd2_Q20 = q2_Q10 * Lambda_Q10;
+
+    table[ 31 ][ 0 ] = q1_Q10;
+    table[ 31 ][ 1 ] = q2_Q10;
+    table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
+    table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
+
+    /* > 0 */
+    for (k = 1; k <= 31; k++)
+    {
+        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
+
+        q1_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10;
+        q2_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
+        rd1_Q20 = q1_Q10 * Lambda_Q10;
+        rd2_Q20 = q2_Q10 * Lambda_Q10;
+
+        table[ 32 + k ][ 0 ] = q1_Q10;
+        table[ 32 + k ][ 1 ] = q2_Q10;
+        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
+        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
+    }
+
+    /* < -1 */
+    for (k = -32; k <= -2; k++)
+    {
+        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
+
+        q1_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10;
+        q2_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
+        rd1_Q20 = - q1_Q10 * Lambda_Q10;
+        rd2_Q20 = - q2_Q10 * Lambda_Q10;
+
+        table[ 32 + k ][ 0 ] = q1_Q10;
+        table[ 32 + k ][ 1 ] = q2_Q10;
+        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
+        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
+    }
+
+    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
+        LSF_interpolation_flag = 0;
+    } else {
+        LSF_interpolation_flag = 1;
+    }
+
+    ALLOC( sLTP_Q15,
+           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
+    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
+    /* Set up pointers to start of sub frame */
+    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
+    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
+    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
+    for( k = 0; k < psEncC->nb_subfr; k++ ) {
+        A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
+        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
+        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+
+        /* Noise shape parameters */
+        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
+        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
+        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
+
+        NSQ->rewhite_flag = 0;
+        if( psIndices->signalType == TYPE_VOICED ) {
+            /* Voiced */
+            lag = pitchL[ k ];
+
+            /* Re-whitening */
+            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
+                /* Rewhiten with new A coefs */
+                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
+                celt_assert( start_idx > 0 );
+
+                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
+                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
+
+                NSQ->rewhite_flag = 1;
+                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
+            }
+        }
+
+        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
+
+        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
+        {
+            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
+                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
+                offset_Q10, psEncC->subfr_length, &(table[32]) );
+        }
+        else
+        {
+            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
+                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
+                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
+        }
+
+        x_Q3   += psEncC->subfr_length;
+        pulses += psEncC->subfr_length;
+        pxq    += psEncC->subfr_length;
+    }
+
+    /* Update lagPrev for next frame */
+    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
+
+    /* Save quantized speech and noise shaping signals */
+    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
+    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+    RESTORE_STACK;
+}
+
+/***********************************/
+/* silk_noise_shape_quantizer_10_16  */
+/***********************************/
+static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
+    opus_int            signalType,             /* I    Signal type                     */
+    const opus_int32    x_sc_Q10[],             /* I                                    */
+    opus_int8           pulses[],               /* O                                    */
+    opus_int16          xq[],                   /* O                                    */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
+    opus_int            lag,                    /* I    Pitch lag                       */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
+    opus_int32          LF_shp_Q14,             /* I                                    */
+    opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            offset_Q10,             /* I                                    */
+    opus_int            length,                 /* I    Input length                    */
+    opus_int32          table[][4]              /* I                                    */
+)
+{
+    opus_int     i;
+    opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
+    opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
+    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
+    opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
+
+    __m128i xmm_tempa, xmm_tempb;
+
+    __m128i xmm_one;
+
+    __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
+    __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
+    __m128i a_Q12_01234567,        a_Q12_89ABCDEF;
+
+    __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
+    __m128i AR_shp_Q13_76543210;
+
+    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
+    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
+    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
+
+    /* Set up short term AR state */
+    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];
+
+    sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
+    xq_Q14         = psLPC_Q14[ 0 ];
+    LTP_pred_Q13   = 0;
+
+    /* load a_Q12 */
+    xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
+
+    /* load a_Q12[0] - a_Q12[7] */
+    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
+    /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
+    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );
+
+    a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
+    a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );
+
+    /* load AR_shp_Q13 */
+    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );
+
+    /* load psLPC_Q14 */
+    xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
+
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );
+
+    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
+    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
+
+    psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
+    psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
+
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );
+
+    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
+    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
+
+    psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
+    psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
+
+    /* load sAR2_Q14 */
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );
+
+    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
+    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
+
+    sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
+    sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
+
+    /* prepare 1 in 8 * 16bit */
+    xmm_one = _mm_set1_epi16(1);
+
+    for( i = 0; i < length; i++ )
+    {
+        /* Short-term prediction */
+        __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;
+
+        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+        LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */
+
+        /* shift psLPC_Q14 */
+        psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
+        psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );
+
+        psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
+        psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );
+
+        psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
+        psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14),       7 );
+
+        /* high part, use pmaddwd, results in 4 32-bit */
+        xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
+        xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );
+
+        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */
+        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
+        xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );
+
+        xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
+        xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );
+
+        xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
+        xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );
+
+        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
+        xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );
+
+        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
+        xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );
+
+        /* accumulate */
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
+        xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );
+
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
+
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
+
+        LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );
+
+        /* Long-term prediction */
+        if ( opus_likely( signalType == TYPE_VOICED ) ) {
+            /* Unrolled loop */
+            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+            LTP_pred_Q13 = 2;
+            {
+                __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;
+
+                b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
+                b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );
+
+                /* loaded: [0] [-1] [-2] [-3] */
+                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
+                /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
+                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
+                /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
+                xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
+                /* right shift 2 bytes (16 bits), zero extended */
+                xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );
+
+                /* a[1] * b[-1], a[3] * b[-3] */
+                pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
+                pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );
+
+                pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
+                /* equal shift right 8 bytes*/
+                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+                xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );
+
+                LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );
+
+                LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
+                pred_lag_ptr++;
+            }
+        }
+
+        /* Noise shape feedback */
+        NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
+        NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );
+
+        sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
+        sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
+
+        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
+        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14),       0 );
+
+        /* high part, use pmaddwd, results in 4 32-bit */
+        xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
+
+        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */
+        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
+        xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );
+
+        xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
+        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
+
+        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
+
+        /* accumulate */
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
+
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
+
+        n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );
+
+        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
+        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );
+
+        n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                                /* Q11 -> Q12 */
+        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );
+
+        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
+        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
+
+        silk_assert( lag > 0 || signalType != TYPE_VOICED );
+
+        /* Combine prediction and noise shaping signals */
+        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
+        tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
+        if( lag > 0 ) {
+            /* Symmetric, packed FIR coefficients */
+            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
+            shp_lag_ptr++;
+
+            tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 );                       /* Q13 */
+            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 );                          /* Q13 */
+            tmp1 = silk_RSHIFT_ROUND( tmp1, 3 );                                /* Q10 */
+        } else {
+            tmp1 = silk_RSHIFT_ROUND( tmp1, 2 );                                /* Q10 */
+        }
+
+        r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 );                              /* residual error Q10 */
+
+        /* Generate dither */
+        NSQ->rand_seed = silk_RAND( NSQ->rand_seed );
+
+        /* Flip sign depending on dither */
+        tmp2 = -r_Q10;
+        if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;
+
+        r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
+
+        /* Find two quantization level candidates and measure their rate-distortion */
+        q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
+        q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+
+        q1_Q10 = table[q1_Q0][0];
+        q2_Q10 = table[q1_Q0][1];
+
+        if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
+        {
+            q1_Q10 = q2_Q10;
+        }
+
+        pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );
+
+        /* Excitation */
+        exc_Q14 = silk_LSHIFT( q1_Q10, 4 );
+
+        tmp2 = -exc_Q14;
+        if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;
+
+        /* Add predictions */
+        LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
+        xq_Q14      = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );
+
+        /* Update states */
+        psLPC_Q14++;
+        *psLPC_Q14 = xq_Q14;
+        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
+
+        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
+        sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
+        NSQ->sLTP_shp_buf_idx++;
+        NSQ->sLTP_buf_idx++;
+
+        /* Make dither dependent on quantized signal */
+        NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
+    }
+
+    NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;
+
+    /* Scale XQ back to normal level before saving */
+    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];
+
+    /* write back sAR2_Q14 */
+    xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
+    xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
+    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
+    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
+
+    /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
+    {
+        __m128i xmm_Gain_Q10;
+        __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;
+
+        /* prepare (1 << 7) in packed 4 32-bits */
+        xmm_tempa = _mm_set1_epi32( (1 << 7) );
+
+        /* prepare Gain_Q10 in packed 4 32-bits */
+        xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );
+
+        /* process xq */
+        for (i = 0; i < length - 7; i += 8)
+        {
+            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
+            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );
+
+            /* equal shift right 4 bytes*/
+            xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+            /* equal shift right 4 bytes*/
+            xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+            xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
+            xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
+            xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
+            xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );
+
+            xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
+            xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
+            xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
+            xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );
+
+            xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
+            xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );
+
+            /* silk_RSHIFT_ROUND(xq, 8) */
+            xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
+            xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );
+
+            xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
+            xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );
+
+            /* silk_SAT16 */
+            xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );
+
+            /* save to xq */
+            _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
+        }
+    }
+    for ( ; i < length; i++)
+    {
+        xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
+    }
+
+    /* Update LPC synth buffer */
+    silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
+}
+
+static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
+    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
+    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
+    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
+    opus_int            subfr,                  /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,          /* I                                    */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
+    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
+    const opus_int      signal_type             /* I    Signal type                     */
+)
+{
+    opus_int   i, lag;
+    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
+    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+
+    lag          = pitchL[ subfr ];
+    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
+    silk_assert( inv_gain_Q31 != 0 );
+
+    /* Calculate gain adjustment factor */
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+    } else {
+        gain_adj_Q16 = (opus_int32)1 << 16;
+    }
+
+    /* Scale input */
+    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+
+    /* prepare inv_gain_Q23 in packed 4 32-bits */
+    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+
+    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
+        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+
+        /* equal shift right 4 bytes*/
+        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
+        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+
+        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
+        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+
+        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+
+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
+    }
+
+    for( ; i < psEncC->subfr_length; i++ ) {
+        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+    }
+
+    /* Save inverse gain */
+    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
+
+    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
+    if( NSQ->rewhite_flag ) {
+        if( subfr == 0 ) {
+            /* Do LTP downscaling */
+            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
+        }
+        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
+            silk_assert( i < MAX_FRAME_LENGTH );
+            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
+        }
+    }
+
+    /* Adjust for changing gain */
+    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+        /* Scale long-term shaping state */
+        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
+
+        /* prepare gain_adj_Q16 in packed 4 32-bits */
+        xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);
+
+        for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
+        {
+            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
+            /* equal shift right 4 bytes*/
+            xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+            xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
+            xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
+
+            xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
+            xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
+
+            xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
+
+            _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
+        }
+
+        for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
+            NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
+        }
+
+        /* Scale long-term prediction state */
+        if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
+            for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
+                sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
+            }
+        }
+
+        NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
+
+        /* Scale short-term prediction and shaping states */
+        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
+            NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
+        }
+        for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
+            NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
+        }
+    }
+}
--- a/media/libopus/silk/x86/VAD_sse4_1.c
+++ b/media/libopus/silk/x86/VAD_sse4_1.c
@ -0,0 +1,277 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "main.h"
+#include "stack_alloc.h"
+
+/* Weighting factors for tilt measure */
+static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
+
+/***************************************/
+/* Get the speech activity level in Q8 */
+/***************************************/
+opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if success                  */
+    silk_encoder_state          *psEncC,            /* I/O  Encoder state                               */
+    const opus_int16            pIn[]               /* I    PCM input                                   */
+)
+{
+    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
+    opus_int   decimated_framelength1, decimated_framelength2;
+    opus_int   decimated_framelength;
+    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
+    opus_int32 sumSquared, smooth_coef_Q16;
+    opus_int16 HPstateTmp;
+    VARDECL( opus_int16, X );
+    opus_int32 Xnrg[ VAD_N_BANDS ];
+    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
+    opus_int32 speech_nrg, x_tmp;
+    opus_int   X_offset[ VAD_N_BANDS ];
+    opus_int   ret = 0;
+    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
+
+    SAVE_STACK;
+
+    /* Safety checks */
+    silk_assert( VAD_N_BANDS == 4 );
+    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
+    celt_assert( psEncC->frame_length <= 512 );
+    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
+
+    /***********************/
+    /* Filter and Decimate */
+    /***********************/
+    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
+    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
+    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
+    /* Decimate into 4 bands:
+       0       L      3L       L              3L                             5L
+               -      --       -              --                             --
+               8       8       2               4                              4
+
+       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
+
+       They're arranged to allow the minimal ( frame_length / 4 ) extra
+       scratch space during the downsampling process */
+    X_offset[ 0 ] = 0;
+    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
+    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
+    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
+    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
+
+    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
+    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
+        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
+
+    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
+    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
+        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
+
+    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
+    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
+        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
+
+    /*********************************************/
+    /* HP filter on lowest band (differentiator) */
+    /*********************************************/
+    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
+    HPstateTmp = X[ decimated_framelength - 1 ];
+    for( i = decimated_framelength - 1; i > 0; i-- ) {
+        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
+        X[ i ]     -= X[ i - 1 ];
+    }
+    X[ 0 ] -= psSilk_VAD->HPstate;
+    psSilk_VAD->HPstate = HPstateTmp;
+
+    /*************************************/
+    /* Calculate the energy in each band */
+    /*************************************/
+    for( b = 0; b < VAD_N_BANDS; b++ ) {
+        /* Find the decimated framelength in the non-uniformly divided bands */
+        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
+
+        /* Split length into subframe lengths */
+        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
+        dec_subframe_offset = 0;
+
+        /* Compute energy per sub-frame */
+        /* initialize with summed energy of last subframe */
+        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
+        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
+            __m128i xmm_X, xmm_acc;
+            sumSquared = 0;
+
+            xmm_acc = _mm_setzero_si128();
+
+            for( i = 0; i < dec_subframe_length - 7; i += 8 )
+            {
+                xmm_X   = _mm_loadu_si128( (__m128i *)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
+                xmm_X   = _mm_srai_epi16( xmm_X, 3 );
+                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
+                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
+            }
+
+            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
+            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
+
+            sumSquared += _mm_cvtsi128_si32( xmm_acc );
+
+            for( ; i < dec_subframe_length; i++ ) {
+                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
+                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
+                x_tmp = silk_RSHIFT(
+                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
+                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
+
+                /* Safety check */
+                silk_assert( sumSquared >= 0 );
+            }
+
+            /* Add/saturate summed energy of current subframe */
+            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
+                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
+            } else {
+                /* Look-ahead subframe */
+                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
+            }
+
+            dec_subframe_offset += dec_subframe_length;
+        }
+        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
+    }
+
+    /********************/
+    /* Noise estimation */
+    /********************/
+    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
+
+    /***********************************************/
+    /* Signal-plus-noise to noise ratio estimation */
+    /***********************************************/
+    sumSquared = 0;
+    input_tilt = 0;
+    for( b = 0; b < VAD_N_BANDS; b++ ) {
+        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
+        if( speech_nrg > 0 ) {
+            /* Divide, with sufficient resolution */
+            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
+                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
+            } else {
+                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
+            }
+
+            /* Convert to log domain */
+            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
+
+            /* Sum-of-squares */
+            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
+
+            /* Tilt measure */
+            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
+                /* Scale down SNR value for small subband speech energies */
+                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
+            }
+            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
+        } else {
+            NrgToNoiseRatio_Q8[ b ] = 256;
+        }
+    }
+
+    /* Mean-of-squares */
+    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
+
+    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
+    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
+
+    /*********************************/
+    /* Speech Probability Estimation */
+    /*********************************/
+    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
+
+    /**************************/
+    /* Frequency Tilt Measure */
+    /**************************/
+    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
+
+    /**************************************************/
+    /* Scale the sigmoid output based on power levels */
+    /**************************************************/
+    speech_nrg = 0;
+    for( b = 0; b < VAD_N_BANDS; b++ ) {
+        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
+        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
+    }
+
+    /* Power scaling */
+    if( speech_nrg <= 0 ) {
+        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
+    } else if( speech_nrg < 32768 ) {
+        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
+            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
+        } else {
+            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
+        }
+
+        /* square-root */
+        speech_nrg = silk_SQRT_APPROX( speech_nrg );
+        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
+    }
+
+    /* Copy the resulting speech activity in Q8 */
+    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
+
+    /***********************************/
+    /* Energy Level and SNR estimation */
+    /***********************************/
+    /* Smoothing coefficient */
+    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
+
+    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
+        smooth_coef_Q16 >>= 1;
+    }
+
+    for( b = 0; b < VAD_N_BANDS; b++ ) {
+        /* compute smoothed energy-to-noise ratio per band */
+        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
+            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
+
+        /* signal to noise ratio in dB per band */
+        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
+        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
+        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
+    }
+
+    RESTORE_STACK;
+    return( ret );
+}
--- a/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c
+++ b/media/libopus/silk/x86/VQ_WMat_EC_sse4_1.c
@ -0,0 +1,142 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+
+/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
+void silk_VQ_WMat_EC_sse4_1(
+    opus_int8                   *ind,                           /* O    index of best codebook vector               */
+    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
+    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
+    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int8             *cb_Q7,                         /* I    codebook                                    */
+    const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
+    const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
+    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
+    opus_int                    L                               /* I    number of vectors in codebook               */
+)
+{
+    opus_int   k, gain_tmp_Q7;
+    const opus_int8 *cb_row_Q7;
+    opus_int16 diff_Q14[ 5 ];
+    opus_int32 sum1_Q14, sum2_Q16;
+
+    __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
+    /* Loop over codebook */
+    *rate_dist_Q14 = silk_int32_MAX;
+    cb_row_Q7 = cb_Q7;
+    for( k = 0; k < L; k++ ) {
+        gain_tmp_Q7 = cb_gain_Q7[k];
+
+        diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
+
+        C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
+        C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
+        C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
+        C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
+
+        diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
+        diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
+        diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
+        diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
+
+        /* Weighted rate */
+        sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
+
+        /* Penalty for too large gain */
+        sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
+
+        silk_assert( sum1_Q14 >= 0 );
+
+        /* first row of W_Q18 */
+        C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
+        C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
+        C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
+
+        C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
+        C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
+
+        C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
+        C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
+
+        C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
+        C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
+
+        C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
+        sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
+
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  0 ], diff_Q14[ 0 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 0 ] );
+
+        /* second row of W_Q18 */
+        sum2_Q16 = silk_SMULWB(           W_Q18[  7 ], diff_Q14[ 2 ] );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  8 ], diff_Q14[ 3 ] );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  9 ], diff_Q14[ 4 ] );
+        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  6 ], diff_Q14[ 1 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 1 ] );
+
+        /* third row of W_Q18 */
+        sum2_Q16 = silk_SMULWB(           W_Q18[ 13 ], diff_Q14[ 3 ] );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
+        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 2 ] );
+
+        /* fourth row of W_Q18 */
+        sum2_Q16 = silk_SMULWB(           W_Q18[ 19 ], diff_Q14[ 4 ] );
+        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 3 ] );
+
+        /* last row of W_Q18 */
+        sum2_Q16 = silk_SMULWB(           W_Q18[ 24 ], diff_Q14[ 4 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 4 ] );
+
+        silk_assert( sum1_Q14 >= 0 );
+
+        /* find best */
+        if( sum1_Q14 < *rate_dist_Q14 ) {
+            *rate_dist_Q14 = sum1_Q14;
+            *ind = (opus_int8)k;
+            *gain_Q7 = gain_tmp_Q7;
+        }
+
+        /* Go to next cbk vector */
+        cb_row_Q7 += LTP_ORDER;
+    }
+}
--- a/media/libopus/sources.mozbuild
+++ b/media/libopus/sources.mozbuild
@ -39,7 +39,7 @@ celt_sources_sse2 = [
 ]

 celt_sources_sse4_1 = [
-    'celt/x86/celt_lpc_sse.c',
+    'celt/x86/celt_lpc_sse4_1.c',
    'celt/x86/pitch_sse4_1.c',
 ]

@ -62,17 +62,20 @@ celt_sources_arm_neon_intr = [
 ]

 celt_sources_arm_ne10 = [
-    'celt/arm/celt_ne10_fft.c',
-    'celt/arm/celt_ne10_mdct.c',
+    'celt/arm/celt_fft_ne10.c',
+    'celt/arm/celt_mdct_ne10.c',
 ]

 opus_sources = [
+    'src/mapping_matrix.c',
    'src/opus.c',
    'src/opus_decoder.c',
    'src/opus_encoder.c',
    'src/opus_multistream.c',
    'src/opus_multistream_decoder.c',
    'src/opus_multistream_encoder.c',
+    'src/opus_projection_decoder.c',
+    'src/opus_projection_encoder.c',
    'src/repacketizer.c',
 ]

@ -161,10 +164,10 @@ silk_sources = [
 ]

 silk_sources_sse4_1 = [
-    'silk/x86/NSQ_del_dec_sse.c',
-    'silk/x86/NSQ_sse.c',
-    'silk/x86/VAD_sse.c',
-    'silk/x86/VQ_WMat_EC_sse.c',
+    'silk/x86/NSQ_del_dec_sse4_1.c',
+    'silk/x86/NSQ_sse4_1.c',
+    'silk/x86/VAD_sse4_1.c',
+    'silk/x86/VQ_WMat_EC_sse4_1.c',
    'silk/x86/x86_silk_map.c',
 ]

@ -203,8 +206,8 @@ silk_sources_fixed = [
 ]

 silk_sources_fixed_sse4_1 = [
-    'silk/fixed/x86/burg_modified_FIX_sse.c',
-    'silk/fixed/x86/vector_ops_FIX_sse.c',
+    'silk/fixed/x86/burg_modified_FIX_sse4_1.c',
+    'silk/fixed/x86/vector_ops_FIX_sse4_1.c',
 ]

 silk_sources_fixed_arm_neon_intr = [
--- a/Показать больше
+++ b/Показать больше