From ca960ebd528a5da8c65da56fb5f979568dd0379b Mon Sep 17 00:00:00 2001
From: alwu
Date: Sat, 5 Nov 2022 02:23:33 +0000
Subject: [PATCH] Bug 1262276 - part12 : store looping offset in the media
 queue in order to keep timestamps consistently increasing across different
 states. r=padenot

When we leave the looping state for another state, the media data already
stored in the media queue have had their timestamps adjusted. If the new state
requests new data but doesn't adjust its timestamp, the data in the media queue
end up out of order.

If that happens on video data, it causes a/v unsync, and the video frame gets
discarded because it can't catch up with the clock time, which might have grown
a lot after looping multiple times.

Example transitions out of the looping state that can hit this situation are
the buffering state (decoding too slow), the decoding state (looping cancelled)
and a video-only seek (background video resume).

Since the clock time has to keep growing, we need to keep the offset somewhere
independent of the states. Therefore, we let the media queue do the timestamp
adjustment. Even after we leave the looping state, newly arriving data get
their timestamps adjusted correctly and match the clock time. If we enter the
looping state again, we can smoothly keep adding more offset to all future
data.

Differential Revision: https://phabricator.services.mozilla.com/D160576
---
 dom/media/MediaDecoderStateMachine.cpp | 119 ++++++++++---------------
 dom/media/MediaQueue.h                 |  77 ++++++++++++++++
 dom/media/gtest/TestMediaQueue.cpp     |  93 ++++++++++++++++++-
 3 files changed, 214 insertions(+), 75 deletions(-)

diff --git a/dom/media/MediaDecoderStateMachine.cpp b/dom/media/MediaDecoderStateMachine.cpp
index 99510c56f338..e058183246f6 100644
--- a/dom/media/MediaDecoderStateMachine.cpp
+++ b/dom/media/MediaDecoderStateMachine.cpp
@@ -871,6 +871,20 @@ class MediaDecoderStateMachine::LoopingDecodingState
         mVideoEndedBeforeEnteringStateWithoutDuration = true;
       }
     }
+
+    // If we've looped at least once before, the master's media queues have
+    // already stored some adjusted data. If a track has reached EOS, we need
+    // to update the queue offset as well; otherwise it would cause a/v unsync.
+ if (mMaster->mOriginalDecodedDuration != media::TimeUnit::Zero()) { + if (mIsReachingAudioEOS && mMaster->HasAudio()) { + AudioQueue().SetOffset(AudioQueue().GetOffset() + + mMaster->mOriginalDecodedDuration); + } + if (mIsReachingVideoEOS && mMaster->HasVideo()) { + VideoQueue().SetOffset(VideoQueue().GetOffset() + + mMaster->mOriginalDecodedDuration); + } + } } void Enter() { @@ -890,8 +904,8 @@ class MediaDecoderStateMachine::LoopingDecodingState SLOG("Leaving looping state, offset [a=%" PRId64 ",v=%" PRId64 "], endtime [a=%" PRId64 ",v=%" PRId64 "], track duration [a=%" PRId64 ",v=%" PRId64 "], waiting=%s", - mAudioLoopingOffset.ToMicroseconds(), - mVideoLoopingOffset.ToMicroseconds(), + AudioQueue().GetOffset().ToMicroseconds(), + VideoQueue().GetOffset().ToMicroseconds(), mMaster->mDecodedAudioEndTime.ToMicroseconds(), mMaster->mDecodedVideoEndTime.ToMicroseconds(), mMaster->mAudioTrackDecodedDuration @@ -935,30 +949,24 @@ class MediaDecoderStateMachine::LoopingDecodingState void HandleAudioDecoded(AudioData* aAudio) override { // TODO : check if we need to update mOriginalDecodedDuration - MediaResult rv = LoopingAudioTimeAdjustment(aAudio); - if (NS_WARN_IF(NS_FAILED(rv))) { - mMaster->DecodeError(rv); - return; - } + + // After pushing data to the queue, timestamp might be adjusted. + DecodingState::HandleAudioDecoded(aAudio); mMaster->mDecodedAudioEndTime = std::max(aAudio->GetEndTime(), mMaster->mDecodedAudioEndTime); SLOG("audio sample after time-adjustment [%" PRId64 ",%" PRId64 "]", aAudio->mTime.ToMicroseconds(), aAudio->GetEndTime().ToMicroseconds()); - DecodingState::HandleAudioDecoded(aAudio); } void HandleVideoDecoded(VideoData* aVideo) override { // TODO : check if we need to update mOriginalDecodedDuration - MediaResult rv = LoopingVideoTimeAdjustment(aVideo); - if (NS_WARN_IF(NS_FAILED(rv))) { - mMaster->DecodeError(rv); - return; - } + + // After pushing data to the queue, timestamp might be adjusted. 
+ DecodingState::HandleVideoDecoded(aVideo); mMaster->mDecodedVideoEndTime = std::max(aVideo->GetEndTime(), mMaster->mDecodedVideoEndTime); SLOG("video sample after time-adjustment [%" PRId64 ",%" PRId64 "]", aVideo->mTime.ToMicroseconds(), aVideo->GetEndTime().ToMicroseconds()); - DecodingState::HandleVideoDecoded(aVideo); } void HandleEndOfAudio() override { @@ -969,14 +977,15 @@ class MediaDecoderStateMachine::LoopingDecodingState mMaster->mDecodedAudioEndTime); } if (DetermineOriginalDecodedDurationIfNeeded()) { - mAudioLoopingOffset += mMaster->mOriginalDecodedDuration; + AudioQueue().SetOffset(AudioQueue().GetOffset() + + mMaster->mOriginalDecodedDuration); } SLOG( "received audio EOS when seamless looping, starts seeking, " - "mAudioLoopingOffset=[%" PRId64 - "], mAudioTrackDecodedDuration=[%" PRId64 "]", - mAudioLoopingOffset.ToMicroseconds(), + "audioLoopingOffset=[%" PRId64 "], mAudioTrackDecodedDuration=[%" PRId64 + "]", + AudioQueue().GetOffset().ToMicroseconds(), mMaster->mAudioTrackDecodedDuration->ToMicroseconds()); RequestDataFromStartPosition(TrackInfo::TrackType::kAudioTrack); ProcessSamplesWaitingAdjustmentIfAny(); @@ -990,14 +999,15 @@ class MediaDecoderStateMachine::LoopingDecodingState mMaster->mDecodedVideoEndTime); } if (DetermineOriginalDecodedDurationIfNeeded()) { - mVideoLoopingOffset += mMaster->mOriginalDecodedDuration; + VideoQueue().SetOffset(VideoQueue().GetOffset() + + mMaster->mOriginalDecodedDuration); } SLOG( "received video EOS when seamless looping, starts seeking, " - "mVideoLoopingOffset=[%" PRId64 - "], mVideoTrackDecodedDuration=[%" PRId64 "]", - mVideoLoopingOffset.ToMicroseconds(), + "videoLoopingOffset=[%" PRId64 "], mVideoTrackDecodedDuration=[%" PRId64 + "]", + VideoQueue().GetOffset().ToMicroseconds(), mMaster->mVideoTrackDecodedDuration->ToMicroseconds()); RequestDataFromStartPosition(TrackInfo::TrackType::kVideoTrack); ProcessSamplesWaitingAdjustmentIfAny(); @@ -1168,10 +1178,6 @@ class MediaDecoderStateMachine::LoopingDecodingState return; } - MOZ_ASSERT_IF(mIsReachingAudioEOS, - mAudioLoopingOffset == media::TimeUnit::Zero()); - MOZ_ASSERT_IF(mIsReachingVideoEOS, - mVideoLoopingOffset == media::TimeUnit::Zero()); // If we have already reached EOS before starting media sink, the sink // has not started yet and the current position is larger than last decoded // end time, that means we directly seeked to EOS and playback would start @@ -1282,9 +1288,19 @@ class MediaDecoderStateMachine::LoopingDecodingState SLOG("process %s sample waiting for timestamp adjustment", isAudio ? "audio" : "video"); if (isAudio) { + // Waiting sample is for next round of looping, so the queue offset + // shouldn't be zero. This happens when the track has reached EOS before + // entering the state (and looping never happens before). Same for below + // video case. + if (AudioQueue().GetOffset() == media::TimeUnit::Zero()) { + AudioQueue().SetOffset(mMaster->mOriginalDecodedDuration); + } HandleAudioDecoded(data->As()); } else { MOZ_DIAGNOSTIC_ASSERT(data->mType == MediaData::Type::VIDEO_DATA); + if (VideoQueue().GetOffset() == media::TimeUnit::Zero()) { + VideoQueue().SetOffset(mMaster->mOriginalDecodedDuration); + } HandleVideoDecoded(data->As()); } } @@ -1324,42 +1340,6 @@ class MediaDecoderStateMachine::LoopingDecodingState MaybeStopPrerolling(); } - MediaResult LoopingAudioTimeAdjustment(AudioData* aAudio) { - // `mOriginalDecodedDuration` can only be determined after we know both - // tracks' original duration. 
This case happens when audio track reaches EOS - // before entering the state, so we haven't set its offset yet. - if (mAudioLoopingOffset == media::TimeUnit::Zero() && - mMaster->mOriginalDecodedDuration != media::TimeUnit::Zero()) { - mAudioLoopingOffset += mMaster->mOriginalDecodedDuration; - } - if (mAudioLoopingOffset != media::TimeUnit::Zero()) { - aAudio->mTime += mAudioLoopingOffset; - } - return aAudio->mTime.IsValid() - ? MediaResult(NS_OK) - : MediaResult( - NS_ERROR_DOM_MEDIA_OVERFLOW_ERR, - "Audio sample overflow during looping time adjustment"); - } - - MediaResult LoopingVideoTimeAdjustment(VideoData* aVideo) { - // `mOriginalDecodedDuration` can only be determined after we know both - // tracks' original duration. This case happens when audio track reaches EOS - // before entering the state, so we haven't set its offset yet. - if (mVideoLoopingOffset == media::TimeUnit::Zero() && - mMaster->mOriginalDecodedDuration != media::TimeUnit::Zero()) { - mVideoLoopingOffset += mMaster->mOriginalDecodedDuration; - } - if (mVideoLoopingOffset != media::TimeUnit::Zero()) { - aVideo->mTime += mVideoLoopingOffset; - } - return aVideo->mTime.IsValid() - ? MediaResult(NS_OK) - : MediaResult( - NS_ERROR_DOM_MEDIA_OVERFLOW_ERR, - "Video sample overflow during looping time adjustment"); - } - bool ShouldDiscardLoopedData(MediaData::Type aType) const { if (!mMaster->mMediaSink->IsStarted()) { return false; @@ -1387,7 +1367,8 @@ class MediaDecoderStateMachine::LoopingDecodingState * ClockTime offset mDecodedXXXEndTime * */ - const auto offset = isAudio ? mAudioLoopingOffset : mVideoLoopingOffset; + const auto offset = + isAudio ? AudioQueue().GetOffset() : VideoQueue().GetOffset(); const auto endTime = isAudio ? mMaster->mDecodedAudioEndTime : mMaster->mDecodedVideoEndTime; const auto clockTime = mMaster->GetClock(); @@ -1399,7 +1380,8 @@ class MediaDecoderStateMachine::LoopingDecodingState MOZ_DIAGNOSTIC_ASSERT(aType == MediaData::Type::AUDIO_DATA || aType == MediaData::Type::VIDEO_DATA); const bool isAudio = aType == MediaData::Type::AUDIO_DATA; - const auto offset = isAudio ? mAudioLoopingOffset : mVideoLoopingOffset; + const auto offset = + isAudio ? AudioQueue().GetOffset() : VideoQueue().GetOffset(); if (offset == media::TimeUnit::Zero()) { return; } @@ -1494,15 +1476,6 @@ class MediaDecoderStateMachine::LoopingDecodingState */ RefPtr mDataWaitingTimestampAdjustment; - // The accumuated offset after looping to the start position for tracks. - // Eg. Media duration 10, and we've looped 5 times, offset will be 50. - // Note, most of time they will be the same when we have both tracks, but we - // separate them in order to handle the case where both tracks reach EOS at - // different time. Eg. media duration 10, if audio track reaches to EOS - // already, but video hasn't. Then audio offset is 10, but video is 0. - media::TimeUnit mAudioLoopingOffset = media::TimeUnit::Zero(); - media::TimeUnit mVideoLoopingOffset = media::TimeUnit::Zero(); - MozPromiseRequestHolder mAudioSeekRequest; MozPromiseRequestHolder mVideoSeekRequest; MozPromiseRequestHolder mAudioDataRequest; diff --git a/dom/media/MediaQueue.h b/dom/media/MediaQueue.h index 3fd9049fb9f4..4c05745d0a06 100644 --- a/dom/media/MediaQueue.h +++ b/dom/media/MediaQueue.h @@ -17,7 +17,34 @@ namespace mozilla { +extern LazyLogModule gMediaDecoderLog; + +# define QLOG(msg, ...) 
\
+  MOZ_LOG(gMediaDecoderLog, LogLevel::Debug, \
+          ("MediaQueue=%p " msg, this, ##__VA_ARGS__))
+
 class AudioData;
+class VideoData;
+
+template <typename T>
+struct TimestampAdjustmentTrait {
+  static const bool mValue = false;
+};
+
+template <>
+struct TimestampAdjustmentTrait<AudioData> {
+  static const bool mValue = true;
+};
+
+template <>
+struct TimestampAdjustmentTrait<VideoData> {
+  static const bool mValue = true;
+};
+
+template <typename T>
+struct NonTimestampAdjustmentTrait {
+  static const bool mValue = !TimestampAdjustmentTrait<T>::mValue;
+};
 
 template <class T>
 class MediaQueue : private nsRefPtrDeque<T> {
@@ -34,8 +61,34 @@ class MediaQueue : private nsRefPtrDeque<T> {
     return nsRefPtrDeque<T>::GetSize();
   }
 
+  template <typename U,
+            std::enable_if_t<TimestampAdjustmentTrait<U>::mValue, bool> = true>
+  inline void AdjustTimeStampIfNeeded(U* aItem) {
+    static_assert(std::is_same_v<U, AudioData> ||
+                  std::is_same_v<U, VideoData>);
+    if (mOffset != media::TimeUnit::Zero()) {
+      const auto prev = aItem->mTime, prevEndTime = aItem->GetEndTime();
+      aItem->mTime += mOffset;
+      if (!aItem->mTime.IsValid()) {
+        NS_WARNING("Reverting timestamp adjustment due to sample overflow!");
+        aItem->mTime = prev;
+      } else {
+        QLOG("adjusted %s sample [%" PRId64 ",%" PRId64 "] -> [%" PRId64
+             ",%" PRId64 "]",
+             std::is_same_v<U, AudioData> ? "audio" : "video",
+             prev.ToMicroseconds(), prevEndTime.ToMicroseconds(),
+             aItem->mTime.ToMicroseconds(),
+             aItem->GetEndTime().ToMicroseconds());
+      }
+    }
+  }
+
+  template <typename U,
+            std::enable_if_t<NonTimestampAdjustmentTrait<U>::mValue, bool> = true>
+  inline void AdjustTimeStampIfNeeded(U* aItem) {}
+
   inline void PushFront(T* aItem) {
     RecursiveMutexAutoLock lock(mRecursiveMutex);
+    AdjustTimeStampIfNeeded(aItem);
     nsRefPtrDeque<T>::PushFront(aItem);
   }
 
@@ -50,6 +103,7 @@ class MediaQueue : private nsRefPtrDeque<T> {
 
     MOZ_DIAGNOSTIC_ASSERT(item);
     MOZ_DIAGNOSTIC_ASSERT(item->GetEndTime() >= item->mTime);
+    AdjustTimeStampIfNeeded(item);
     nsRefPtrDeque<T>::Push(dont_AddRef(item));
     mPushEvent.Notify(RefPtr<T>(item));
 
@@ -88,6 +142,7 @@ class MediaQueue : private nsRefPtrDeque<T> {
   void Reset() {
     RecursiveMutexAutoLock lock(mRecursiveMutex);
     nsRefPtrDeque<T>::Erase();
+    SetOffset(media::TimeUnit::Zero());
     mEndOfStream = false;
   }
 
@@ -154,6 +209,22 @@ class MediaQueue : private nsRefPtrDeque<T> {
     return frames;
   }
 
+  bool SetOffset(const media::TimeUnit& aOffset) {
+    if (!aOffset.IsValid()) {
+      QLOG("Invalid offset!");
+      return false;
+    }
+    RecursiveMutexAutoLock lock(mRecursiveMutex);
+    mOffset = aOffset;
+    QLOG("Set media queue offset %" PRId64, mOffset.ToMicroseconds());
+    return true;
+  }
+
+  media::TimeUnit GetOffset() const {
+    RecursiveMutexAutoLock lock(mRecursiveMutex);
+    return mOffset;
+  }
+
   MediaEventSource<RefPtr<T>>& PopFrontEvent() { return mPopFrontEvent; }
 
   MediaEventSource<RefPtr<T>>& PushEvent() { return mPushEvent; }
 
@@ -186,8 +257,14 @@ class MediaQueue : private nsRefPtrDeque<T> {
   // True when we've decoded the last frame of data in the
   // bitstream for which we're queueing frame data.
   bool mEndOfStream;
+  // This offset will be added to any data pushed into the queue. We use it
+  // when the media queue starts receiving looped data, whose timestamps need
+  // to be modified.
+  media::TimeUnit mOffset;
 };
 
 }  // namespace mozilla
 
+# undef QLOG
+
 #endif
diff --git a/dom/media/gtest/TestMediaQueue.cpp b/dom/media/gtest/TestMediaQueue.cpp
index 5b5a74cae415..5b049dc7fe73 100644
--- a/dom/media/gtest/TestMediaQueue.cpp
+++ b/dom/media/gtest/TestMediaQueue.cpp
@@ -10,10 +10,23 @@
 using namespace mozilla;
 using mozilla::media::TimeUnit;
 
-MediaData* CreateDataRawPtr(int64_t aStartTime, int64_t aEndTime) {
+MediaData* CreateDataRawPtr(
+    int64_t aStartTime, int64_t aEndTime,
+    MediaData::Type aType = MediaData::Type::NULL_DATA) {
   const TimeUnit startTime = TimeUnit::FromMicroseconds(aStartTime);
   const TimeUnit endTime = TimeUnit::FromMicroseconds(aEndTime);
-  return new NullData(0, startTime, endTime - startTime);
+  MediaData* data;
+  if (aType == MediaData::Type::AUDIO_DATA) {
+    AlignedAudioBuffer samples;
+    data = new AudioData(0, startTime, std::move(samples), 2, 44100);
+    data->mDuration = endTime - startTime;
+  } else if (aType == MediaData::Type::VIDEO_DATA) {
+    data = new VideoData(0, startTime, endTime - startTime, true, startTime,
+                         gfx::IntSize(), 0);
+  } else {
+    data = new NullData(0, startTime, endTime - startTime);
+  }
+  return data;
 }
 
 already_AddRefed<MediaData> CreateData(int64_t aStartTime, int64_t aEndTime) {
@@ -196,4 +209,80 @@ TEST(MediaQueue, CallGetElementAfterOnMultipleElements)
   EXPECT_TRUE(emptyResult.IsEmpty());
 }
 
+TEST(MediaQueue, TimestampAdjustmentForSupportDataType)
+{
+  const size_t kOffSet = 30;
+  {
+    MediaQueue<AudioData> audioQueue;
+    audioQueue.Push(
+        CreateDataRawPtr(0, 10, MediaData::Type::AUDIO_DATA)->As<AudioData>());
+    audioQueue.SetOffset(TimeUnit::FromMicroseconds(kOffSet));
+    audioQueue.Push(
+        CreateDataRawPtr(0, 10, MediaData::Type::AUDIO_DATA)->As<AudioData>());
+
+    // Data stored before setting the offset shouldn't be changed
+    RefPtr<AudioData> data = audioQueue.PopFront();
+    EXPECT_EQ(data->mTime, TimeUnit::FromMicroseconds(0));
+    EXPECT_EQ(data->GetEndTime(), TimeUnit::FromMicroseconds(10));
+
+    // Data stored after setting the offset should be changed
+    data = audioQueue.PopFront();
+    EXPECT_EQ(data->mTime, TimeUnit::FromMicroseconds(0 + kOffSet));
+    EXPECT_EQ(data->GetEndTime(), TimeUnit::FromMicroseconds(10 + kOffSet));
+
+    // Reset will clean the offset.
+    audioQueue.Reset();
+    audioQueue.Push(
+        CreateDataRawPtr(0, 10, MediaData::Type::AUDIO_DATA)->As<AudioData>());
+    data = audioQueue.PopFront();
+    EXPECT_EQ(data->mTime, TimeUnit::FromMicroseconds(0));
+    EXPECT_EQ(data->GetEndTime(), TimeUnit::FromMicroseconds(10));
+  }
+
+  // Check another supported type
+  MediaQueue<VideoData> videoQueue;
+  videoQueue.Push(
+      CreateDataRawPtr(0, 10, MediaData::Type::VIDEO_DATA)->As<VideoData>());
+  videoQueue.SetOffset(TimeUnit::FromMicroseconds(kOffSet));
+  videoQueue.Push(
+      CreateDataRawPtr(0, 10, MediaData::Type::VIDEO_DATA)->As<VideoData>());
+
+  // Data stored before setting the offset shouldn't be changed
+  RefPtr<VideoData> data = videoQueue.PopFront();
+  EXPECT_EQ(data->mTime, TimeUnit::FromMicroseconds(0));
+  EXPECT_EQ(data->GetEndTime(), TimeUnit::FromMicroseconds(10));
+
+  // Data stored after setting the offset should be changed
+  data = videoQueue.PopFront();
+  EXPECT_EQ(data->mTime, TimeUnit::FromMicroseconds(0 + kOffSet));
+  EXPECT_EQ(data->GetEndTime(), TimeUnit::FromMicroseconds(10 + kOffSet));
+
+  // Reset will clean the offset.
+  videoQueue.Reset();
+  videoQueue.Push(
+      CreateDataRawPtr(0, 10, MediaData::Type::VIDEO_DATA)->As<VideoData>());
+  data = videoQueue.PopFront();
+  EXPECT_EQ(data->mTime, TimeUnit::FromMicroseconds(0));
+  EXPECT_EQ(data->GetEndTime(), TimeUnit::FromMicroseconds(10));
+}
+
+TEST(MediaQueue, TimestampAdjustmentForNotSupportDataType)
+{
+  const size_t kOffSet = 30;
+
+  MediaQueue<MediaData> queue;
+  queue.Push(CreateDataRawPtr(0, 10));
+  queue.SetOffset(TimeUnit::FromMicroseconds(kOffSet));
+  queue.Push(CreateDataRawPtr(0, 10));
+
+  // Offset won't affect any data at all.
+  RefPtr<MediaData> data = queue.PopFront();
+  EXPECT_EQ(data->mTime, TimeUnit::FromMicroseconds(0));
+  EXPECT_EQ(data->GetEndTime(), TimeUnit::FromMicroseconds(10));
+
+  data = queue.PopFront();
+  EXPECT_EQ(data->mTime, TimeUnit::FromMicroseconds(0));
+  EXPECT_EQ(data->GetEndTime(), TimeUnit::FromMicroseconds(10));
+}
+
 #undef EXPECT_EQUAL_SIZE_T
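
For readers who want the core idea of this patch without the surrounding state-machine code, here is a small standalone sketch of the offset mechanism the patch moves into the media queue. It is an illustration only, not the Mozilla API: Sample, LoopingSampleQueue and the plain int64_t microsecond times are made-up stand-ins for MediaData, MediaQueue and media::TimeUnit, and the real queue additionally takes a mutex, validates the offset and only adjusts AudioData/VideoData (see TimestampAdjustmentTrait above).

#include <cstdint>
#include <deque>
#include <iostream>

// A decoded sample with a start time and duration, in microseconds.
struct Sample {
  int64_t mTime;
  int64_t mDuration;
  int64_t EndTime() const { return mTime + mDuration; }
};

// Minimal stand-in for MediaQueue's new offset behaviour: every sample pushed
// after SetOffset() is shifted forward, so timestamps keep increasing across
// loop iterations no matter which state pushed the data.
class LoopingSampleQueue {
 public:
  void SetOffset(int64_t aOffsetUs) { mOffsetUs = aOffsetUs; }
  int64_t GetOffset() const { return mOffsetUs; }

  void Push(Sample aSample) {
    aSample.mTime += mOffsetUs;  // adjust at push time, not in a decoder state
    mQueue.push_back(aSample);
  }

  Sample PopFront() {
    Sample s = mQueue.front();
    mQueue.pop_front();
    return s;
  }

  void Reset() {
    mQueue.clear();
    mOffsetUs = 0;  // mirrors MediaQueue::Reset() clearing the offset
  }

 private:
  std::deque<Sample> mQueue;
  int64_t mOffsetUs = 0;  // accumulated looping offset
};

int main() {
  constexpr int64_t kMediaDurationUs = 10'000'000;  // a 10 s file
  LoopingSampleQueue queue;

  // First pass through the file: no offset yet.
  queue.Push({0, 2'000'000});

  // A track reaches EOS and playback loops: accumulate the whole decoded
  // duration, as the looping state does with mOriginalDecodedDuration.
  queue.SetOffset(queue.GetOffset() + kMediaDurationUs);

  // Second-pass data is decoded with timestamps starting at 0 again,
  // but pops out shifted past everything from the first pass.
  queue.Push({0, 2'000'000});

  std::cout << queue.PopFront().mTime << '\n';  // 0
  std::cout << queue.PopFront().mTime << '\n';  // 10000000
}

Because the offset lives with the queue rather than with any particular decoder state, data pushed after leaving the looping state still comes out with monotonically increasing timestamps, and resetting the queue (a seek, in the real code) is the natural place to clear it.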