Bug 938686 - Support Opus in WebM. r=kinetik

Support the Opus audio codec in the WebM (Matroska) container. This is part of the "WebM 2" proposed spec, which also includes the new VP9 video codec. Alas, we weren't able to get consensus to change the doctype or filename extension to mark the revision allowing the new codecs.
2013-11-22 14:07:00 -08:00 · 2013-11-22 14:07:00 -08:00 · 33d36a07dc
--- a/content/media/DecoderTraits.cpp
+++ b/content/media/DecoderTraits.cpp
@ -171,10 +171,11 @@ static const char* const gWebMTypes[3] = {
  nullptr
 };

-static char const *const gWebMCodecs[4] = {
+static char const *const gWebMCodecs[5] = {
  "vp8",
  "vp8.0",
  "vorbis",
+  "opus",
  nullptr
 };

--- a/content/media/webm/WebMReader.cpp
+++ b/content/media/webm/WebMReader.cpp
@ -16,6 +16,8 @@
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"

+#include "OggReader.h"
+
 using mozilla::NesteggPacketHolder;

 template <>
@ -141,6 +143,11 @@ WebMReader::WebMReader(AbstractMediaDecoder* aDecoder)
  mContext(nullptr),
  mPacketCount(0),
  mChannels(0),
+#ifdef MOZ_OPUS
+  mOpusParser(nullptr),
+  mOpusDecoder(nullptr),
+  mSkip(0),
+#endif
  mVideoTrack(0),
  mAudioTrack(0),
  mAudioStartUsec(-1),
@ -177,6 +184,11 @@ WebMReader::~WebMReader()
  vorbis_info_clear(&mVorbisInfo);
  vorbis_comment_clear(&mVorbisComment);

+  if (mOpusDecoder) {
+    opus_multistream_decoder_destroy(mOpusDecoder);
+    mOpusDecoder = nullptr;
+  }
+
  MOZ_COUNT_DTOR(WebMReader);
 }

@ -338,51 +350,83 @@ nsresult WebMReader::ReadMetadata(MediaInfo* aInfo,
      mAudioTrack = track;
      mHasAudio = true;
      mInfo.mAudio.mHasAudio = true;
+      mAudioCodec = nestegg_track_codec_id(mContext, track);
+      mCodecDelay = params.codec_delay;

-      // Get the Vorbis header data
-      unsigned int nheaders = 0;
-      r = nestegg_track_codec_data_count(mContext, track, &nheaders);
-      if (r == -1 || nheaders != 3) {
-        Cleanup();
-        return NS_ERROR_FAILURE;
-      }
+      if (mAudioCodec == NESTEGG_CODEC_VORBIS) {
+        // Get the Vorbis header data
+        unsigned int nheaders = 0;
+        r = nestegg_track_codec_data_count(mContext, track, &nheaders);
+        if (r == -1 || nheaders != 3) {
+          Cleanup();
+          return NS_ERROR_FAILURE;
+        }

-      for (uint32_t header = 0; header < nheaders; ++header) {
+        for (uint32_t header = 0; header < nheaders; ++header) {
+          unsigned char* data = 0;
+          size_t length = 0;
+
+          r = nestegg_track_codec_data(mContext, track, header, &data, &length);
+          if (r == -1) {
+            Cleanup();
+            return NS_ERROR_FAILURE;
+          }
+          ogg_packet opacket = InitOggPacket(data, length, header == 0, false, 0);
+
+          r = vorbis_synthesis_headerin(&mVorbisInfo,
+                                        &mVorbisComment,
+                                        &opacket);
+          if (r != 0) {
+            Cleanup();
+            return NS_ERROR_FAILURE;
+          }
+        }
+
+        r = vorbis_synthesis_init(&mVorbisDsp, &mVorbisInfo);
+        if (r != 0) {
+          Cleanup();
+          return NS_ERROR_FAILURE;
+        }
+
+        r = vorbis_block_init(&mVorbisDsp, &mVorbisBlock);
+        if (r != 0) {
+          Cleanup();
+          return NS_ERROR_FAILURE;
+        }
+
+        mInfo.mAudio.mRate = mVorbisDsp.vi->rate;
+        mInfo.mAudio.mChannels = mVorbisDsp.vi->channels;
+        mChannels = mInfo.mAudio.mChannels;
+#ifdef MOZ_OPUS
+      } else if (mAudioCodec == NESTEGG_CODEC_OPUS) {
        unsigned char* data = 0;
        size_t length = 0;
-
-        r = nestegg_track_codec_data(mContext, track, header, &data, &length);
+        r = nestegg_track_codec_data(mContext, track, 0, &data, &length);
        if (r == -1) {
          Cleanup();
          return NS_ERROR_FAILURE;
        }

-        ogg_packet opacket = InitOggPacket(data, length, header == 0, false, 0);
-
-        r = vorbis_synthesis_headerin(&mVorbisInfo,
-                                      &mVorbisComment,
-                                      &opacket);
-        if (r != 0) {
+        mOpusParser = new OpusParser;
+        if (!mOpusParser->DecodeHeader(data, length)) {
          Cleanup();
          return NS_ERROR_FAILURE;
        }
-      }

-      r = vorbis_synthesis_init(&mVorbisDsp, &mVorbisInfo);
-      if (r != 0) {
+        if (!InitOpusDecoder()) {
+          Cleanup();
+          return NS_ERROR_FAILURE;
+        }
+
+        mInfo.mAudio.mRate = mOpusParser->mRate;
+
+        mInfo.mAudio.mChannels = mOpusParser->mChannels;
+        mInfo.mAudio.mChannels = mInfo.mAudio.mChannels > 2 ? 2 : mInfo.mAudio.mChannels;
+#endif
+      } else {
        Cleanup();
        return NS_ERROR_FAILURE;
      }
-
-      r = vorbis_block_init(&mVorbisDsp, &mVorbisBlock);
-      if (r != 0) {
-        Cleanup();
-        return NS_ERROR_FAILURE;
-      }
-
-      mInfo.mAudio.mRate = mVorbisDsp.vi->rate;
-      mInfo.mAudio.mChannels = mVorbisDsp.vi->channels;
-      mChannels = mInfo.mAudio.mChannels;
    }
  }

@ -396,6 +440,25 @@ nsresult WebMReader::ReadMetadata(MediaInfo* aInfo,
  return NS_OK;
 }

+#ifdef MOZ_OPUS
+bool WebMReader::InitOpusDecoder()  // Creates mOpusDecoder from mOpusParser's header fields; returns true iff r == OPUS_OK.
+{
+  int r;  // Passed by address to opus_multistream_decoder_create below and compared with OPUS_OK at the end.
+  // A non-null mOpusDecoder here would be overwritten (and so leaked) by the assignment below.
+  NS_ASSERTION(mOpusDecoder == nullptr, "leaking OpusDecoder");
+  // All creation parameters come from mOpusParser, so it must already hold a decoded header.
+  mOpusDecoder = opus_multistream_decoder_create(mOpusParser->mRate,
+                                             mOpusParser->mChannels,
+                                             mOpusParser->mStreams,
+                                             mOpusParser->mCoupledStreams,
+                                             mOpusParser->mMappingTable,
+                                             &r);
+  mSkip = mOpusParser->mPreSkip;  // Assigned whether or not creation succeeded; DecodeAudioPacket trims this many initial frames.
+  // NOTE(review): presumably r is always written by the create call, since it has no initializer - confirm.
+  return r == OPUS_OK;
+}
+#endif
+
 ogg_packet WebMReader::InitOggPacket(unsigned char* aData,
                                       size_t aLength,
                                       bool aBOS,
@ -429,7 +492,7 @@ bool WebMReader::DecodeAudioPacket(nestegg_packet* aPacket, int64_t aOffset)
    return false;
  }

-  const uint32_t rate = mVorbisDsp.vi->rate;
+  const uint32_t rate = mInfo.mAudio.mRate;
  uint64_t tstamp_usecs = tstamp / NS_PER_USEC;
  if (mAudioStartUsec == -1) {
    // This is the first audio chunk. Assume the start time of our decode
@ -471,27 +534,164 @@ bool WebMReader::DecodeAudioPacket(nestegg_packet* aPacket, int64_t aOffset)
    if (r == -1) {
      return false;
    }
+    if (mAudioCodec == NESTEGG_CODEC_VORBIS) {
+      ogg_packet opacket = InitOggPacket(data, length, false, false, -1);

-    ogg_packet opacket = InitOggPacket(data, length, false, false, -1);
+      if (vorbis_synthesis(&mVorbisBlock, &opacket) != 0) {
+        return false;
+      }

-    if (vorbis_synthesis(&mVorbisBlock, &opacket) != 0) {
-      return false;
-    }
+      if (vorbis_synthesis_blockin(&mVorbisDsp,
+                                   &mVorbisBlock) != 0) {
+        return false;
+      }

-    if (vorbis_synthesis_blockin(&mVorbisDsp,
-                                 &mVorbisBlock) != 0) {
-      return false;
-    }
-
-    VorbisPCMValue** pcm = 0;
-    int32_t frames = 0;
-    while ((frames = vorbis_synthesis_pcmout(&mVorbisDsp, &pcm)) > 0) {
-      nsAutoArrayPtr<AudioDataValue> buffer(new AudioDataValue[frames * mChannels]);
-      for (uint32_t j = 0; j < mChannels; ++j) {
-        VorbisPCMValue* channel = pcm[j];
-        for (uint32_t i = 0; i < uint32_t(frames); ++i) {
-          buffer[i*mChannels + j] = MOZ_CONVERT_VORBIS_SAMPLE(channel[i]);
+      VorbisPCMValue** pcm = 0;
+      int32_t frames = 0;
+      while ((frames = vorbis_synthesis_pcmout(&mVorbisDsp, &pcm)) > 0) {
+        nsAutoArrayPtr<AudioDataValue> buffer(new AudioDataValue[frames * mChannels]);
+        for (uint32_t j = 0; j < mChannels; ++j) {
+          VorbisPCMValue* channel = pcm[j];
+          for (uint32_t i = 0; i < uint32_t(frames); ++i) {
+            buffer[i*mChannels + j] = MOZ_CONVERT_VORBIS_SAMPLE(channel[i]);
+          }
        }
+
+        CheckedInt64 duration = FramesToUsecs(frames, rate);
+        if (!duration.isValid()) {
+          NS_WARNING("Int overflow converting WebM audio duration");
+          return false;
+        }
+        CheckedInt64 total_duration = FramesToUsecs(total_frames, rate);
+        if (!total_duration.isValid()) {
+          NS_WARNING("Int overflow converting WebM audio total_duration");
+          return false;
+        }
+
+        CheckedInt64 time = total_duration + tstamp_usecs;
+        if (!time.isValid()) {
+          NS_WARNING("Int overflow adding total_duration and tstamp_usecs");
+          nestegg_free_packet(aPacket);
+          return false;
+        };
+
+        total_frames += frames;
+        AudioQueue().Push(new AudioData(aOffset,
+                                       time.value(),
+                                       duration.value(),
+                                       frames,
+                                       buffer.forget(),
+                                       mChannels));
+        mAudioFrames += frames;
+        if (vorbis_synthesis_read(&mVorbisDsp, frames) != 0) {
+          return false;
+        }
+      }
+    } else if (mAudioCodec == NESTEGG_CODEC_OPUS) {
+#ifdef MOZ_OPUS
+      uint32_t channels = mOpusParser->mChannels;
+
+      // Maximum value is 63*2880, so there's no chance of overflow.
+      int32_t frames_number = opus_packet_get_nb_frames(data, length);
+
+      if (frames_number <= 0)
+        return false; // Invalid packet header.
+      int32_t samples = opus_packet_get_samples_per_frame(data,
+                                                          (opus_int32) rate);
+      int32_t frames = frames_number*samples;
+
+      // A valid Opus packet must be between 2.5 and 120 ms long.
+      if (frames < 120 || frames > 5760)
+        return false;
+      nsAutoArrayPtr<AudioDataValue> buffer(new AudioDataValue[frames * channels]);
+
+      // Decode to the appropriate sample type.
+#ifdef MOZ_SAMPLE_TYPE_FLOAT32
+      int ret = opus_multistream_decode_float(mOpusDecoder,
+                                              data, length,
+                                              buffer, frames, false);
+#else
+      int ret = opus_multistream_decode(mOpusDecoder,
+                                        data, length,
+                                        buffer, frames, false);
+#endif
+      if (ret < 0)
+        return false;
+      NS_ASSERTION(ret == frames, "Opus decoded too few audio samples");
+
+      // Trim the initial frames while the decoder is settling.
+      if (mSkip > 0) {
+        int32_t skipFrames = std::min(mSkip, frames);
+        if (skipFrames == frames) {
+          // discard the whole packet
+          mSkip -= frames;
+          LOG(PR_LOG_DEBUG, ("Opus decoder skipping %d frames"
+                             " (whole packet)", frames));
+          return true;
+        }
+        int32_t keepFrames = frames - skipFrames;
+        int samples = keepFrames * channels;
+        nsAutoArrayPtr<AudioDataValue> trimBuffer(new AudioDataValue[samples]);
+        for (int i = 0; i < samples; i++)
+          trimBuffer[i] = buffer[skipFrames*channels + i];
+
+        frames = keepFrames;
+        buffer = trimBuffer;
+
+        mSkip -= skipFrames;
+        LOG(PR_LOG_DEBUG, ("Opus decoder skipping %d frames", skipFrames));
+      }
+
+      int64_t discardPadding = 0;
+      r = nestegg_packet_discard_padding(aPacket, &discardPadding);
+      if (discardPadding > 0) {
+        CheckedInt64 discardFrames = UsecsToFrames(discardPadding * NS_PER_USEC, rate);
+        if (!discardFrames.isValid()) {
+          NS_WARNING("Int overflow in DiscardPadding");
+          return false;
+        }
+        int32_t keepFrames = frames - discardFrames.value();
+        if (keepFrames > 0) {
+          int samples = keepFrames * channels;
+          nsAutoArrayPtr<AudioDataValue> trimBuffer(new AudioDataValue[samples]);
+          for (int i = 0; i < samples; i++)
+            trimBuffer[i] = buffer[i];
+          frames = keepFrames;
+          buffer = trimBuffer;
+        } else {
+          LOG(PR_LOG_DEBUG, ("Opus decoder discarding whole packet"
+                             " ( %d frames) as padding", frames));
+          return true;
+        }
+      }
+
+      // Apply the header gain if one was specified.
+#ifdef MOZ_SAMPLE_TYPE_FLOAT32
+      if (mOpusParser->mGain != 1.0f) {
+        float gain = mOpusParser->mGain;
+        int samples = frames * channels;
+        for (int i = 0; i < samples; i++) {
+          buffer[i] *= gain;
+        }
+      }
+#else
+      if (mOpusParser->mGain_Q16 != 65536) {
+        int64_t gain_Q16 = mOpusParser->mGain_Q16;
+        int samples = frames * channels;
+        for (int i = 0; i < samples; i++) {
+          int32_t val = static_cast<int32_t>((gain_Q16*buffer[i] + 32768)>>16);
+          buffer[i] = static_cast<AudioDataValue>(MOZ_CLIP_TO_15(val));
+        }
+      }
+#endif
+
+      // More than 2 decoded channels must be downmixed to stereo.
+      if (channels > 2) {
+        // Opus doesn't provide a channel mapping for more than 8 channels,
+        // so we can't downmix more than that.
+        if (channels > 8)
+          return false;
+        OggReader::DownmixToStereo(buffer, channels, frames);
      }

      CheckedInt64 duration = FramesToUsecs(frames, rate);
@ -499,30 +699,25 @@ bool WebMReader::DecodeAudioPacket(nestegg_packet* aPacket, int64_t aOffset)
        NS_WARNING("Int overflow converting WebM audio duration");
        return false;
      }
-      CheckedInt64 total_duration = FramesToUsecs(total_frames, rate);
-      if (!total_duration.isValid()) {
-        NS_WARNING("Int overflow converting WebM audio total_duration");
-        return false;
-      }
-      
-      CheckedInt64 time = total_duration + tstamp_usecs;
+
+      CheckedInt64 time = tstamp_usecs;
      if (!time.isValid()) {
        NS_WARNING("Int overflow adding total_duration and tstamp_usecs");
        nestegg_free_packet(aPacket);
        return false;
      };

-      total_frames += frames;
-      AudioQueue().Push(new AudioData(aOffset,
+      AudioQueue().Push(new AudioData(mDecoder->GetResource()->Tell(),
                                     time.value(),
                                     duration.value(),
                                     frames,
                                     buffer.forget(),
                                     mChannels));
+
      mAudioFrames += frames;
-      if (vorbis_synthesis_read(&mVorbisDsp, frames) != 0) {
-        return false;
-      }
+#else
+      return false;
+#endif /* MOZ_OPUS */
    }
  }

--- a/content/media/webm/WebMReader.h
+++ b/content/media/webm/WebMReader.h
@ -22,6 +22,10 @@
 #include "vorbis/codec.h"
 #endif

+#ifdef MOZ_OPUS
+#include "OpusParser.h"
+#endif
+
 namespace mozilla {

 class WebMBufferedState;
@ -154,6 +158,11 @@ protected:
                           bool aEOS,
                           int64_t aGranulepos);

+#ifdef MOZ_OPUS
+  // Setup opus decoder
+  bool InitOpusDecoder();
+#endif
+
  // Decode a nestegg packet of audio data. Push the audio data on the
  // audio queue. Returns true when there's more audio to decode,
  // false if the audio is finished, end of file has been reached,
@ -182,6 +191,14 @@ private:
  uint32_t mPacketCount;
  uint32_t mChannels;

+
+#ifdef MOZ_OPUS
+  // Opus decoder state
+  nsAutoPtr<OpusParser> mOpusParser;
+  OpusMSDecoder *mOpusDecoder;
+  int mSkip;        // Number of samples left to trim before playback.
+#endif
+
  // Queue of video and audio packets that have been read but not decoded. These
  // must only be accessed from the state machine thread.
  WebMPacketQueue mVideoPackets;
@ -197,6 +214,9 @@ private:
  // Number of audio frames we've decoded since decoding began at mAudioStartMs.
  uint64_t mAudioFrames;

+  // Number of nanoseconds that must be discarded from the start of the Stream.
+  uint64_t mCodecDelay;
+
  // Parser state and computed offset-time mappings.  Shared by multiple
  // readers when decoder has been cloned.  Main thread only.
  nsRefPtr<WebMBufferedState> mBufferedState;
@ -211,6 +231,10 @@ private:
  // Booleans to indicate if we have audio and/or video data
  bool mHasVideo;
  bool mHasAudio;
+
+  // Codec ID of audio track
+  int mAudioCodec;
+
 };

 } // namespace mozilla