From 9a7876feb4eee3f7a7acdd129655ef3ccf799f91 Mon Sep 17 00:00:00 2001 From: Jean-Yves Avenard Date: Wed, 10 Jun 2015 19:38:13 +1000 Subject: [PATCH] Bug 1168040: Part2. Properly handle MP4 time offset in MoofParser. r=kentuckyfriedtakahe --- dom/media/test/manifest.js | 2 +- media/libstagefright/binding/MP4Metadata.cpp | 15 +++- media/libstagefright/binding/MoofParser.cpp | 90 +++++++++++-------- .../binding/include/mp4_demuxer/MoofParser.h | 52 ++++++----- 4 files changed, 95 insertions(+), 64 deletions(-) diff --git a/dom/media/test/manifest.js b/dom/media/test/manifest.js index 2e36edc6cdf4..6e664cffac9a 100644 --- a/dom/media/test/manifest.js +++ b/dom/media/test/manifest.js @@ -228,7 +228,7 @@ var gPlayTests = [ { name:"gizmo.mp4", type:"video/mp4", duration:5.56 }, // Test playback of a MP4 file with a non-zero start time (and audio starting // a second later). - { name:"bipbop-lateaudio.mp4", type:"video/mp4", duration:2.401 }, + { name:"bipbop-lateaudio.mp4", type:"video/mp4" }, { name:"small-shot.m4a", type:"audio/mp4", duration:0.29 }, { name:"small-shot.mp3", type:"audio/mpeg", duration:0.27 }, diff --git a/media/libstagefright/binding/MP4Metadata.cpp b/media/libstagefright/binding/MP4Metadata.cpp index 05e49ddf371f..db5167cb5a3b 100644 --- a/media/libstagefright/binding/MP4Metadata.cpp +++ b/media/libstagefright/binding/MP4Metadata.cpp @@ -68,7 +68,8 @@ private: static inline bool ConvertIndex(FallibleTArray& aDest, - const stagefright::Vector& aIndex) + const stagefright::Vector& aIndex, + int64_t aMediaTime) { if (!aDest.SetCapacity(aIndex.size())) { return false; @@ -78,8 +79,8 @@ ConvertIndex(FallibleTArray& aDest, const stagefright::MediaSource::Indice& s_indice = aIndex[i]; indice.start_offset = s_indice.start_offset; indice.end_offset = s_indice.end_offset; - indice.start_composition = s_indice.start_composition; - indice.end_composition = s_indice.end_composition; + indice.start_composition = s_indice.start_composition - aMediaTime; + indice.end_composition = s_indice.end_composition - aMediaTime; indice.sync = s_indice.sync; MOZ_ALWAYS_TRUE(aDest.AppendElement(indice)); } @@ -248,7 +249,13 @@ MP4Metadata::ReadTrackIndex(FallibleTArray& aDest, mozilla::Track if (!track.get() || track->start() != OK) { return false; } - bool rv = ConvertIndex(aDest, track->exportIndex()); + sp metadata = + mPrivate->mMetadataExtractor->getTrackMetaData(trackNumber); + int64_t mediaTime; + if (!metadata->findInt64(kKeyMediaTime, &mediaTime)) { + mediaTime = 0; + } + bool rv = ConvertIndex(aDest, track->exportIndex(), mediaTime); track->stop(); diff --git a/media/libstagefright/binding/MoofParser.cpp b/media/libstagefright/binding/MoofParser.cpp index f00cb99ebb5f..276daceb79f2 100644 --- a/media/libstagefright/binding/MoofParser.cpp +++ b/media/libstagefright/binding/MoofParser.cpp @@ -48,7 +48,7 @@ MoofParser::RebuildFragmentedIndex(BoxContext& aContext) mInitRange = MediaByteRange(0, box.Range().mEnd); ParseMoov(box); } else if (box.IsType("moof")) { - Moof moof(box, mTrex, mMdhd, mEdts, mSinf, mIsAudio); + Moof moof(box, mTrex, mMvhd, mMdhd, mEdts, mSinf, mIsAudio); if (!moof.IsValid() && !box.Next().IsAvailable()) { // Moof isn't valid abort search for now. @@ -171,7 +171,9 @@ void MoofParser::ParseMoov(Box& aBox) { for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) { - if (box.IsType("trak")) { + if (box.IsType("mvhd")) { + mMvhd = Mvhd(box); + } else if (box.IsType("trak")) { ParseTrak(box); } else if (box.IsType("mvex")) { ParseMvex(box); @@ -190,7 +192,8 @@ MoofParser::ParseTrak(Box& aBox) if (!mTrex.mTrackId || tkhd.mTrackId == mTrex.mTrackId) { ParseMdia(box, tkhd); } - } else if (box.IsType("edts")) { + } else if (box.IsType("edts") && + (!mTrex.mTrackId || tkhd.mTrackId == mTrex.mTrackId)) { mEdts = Edts(box); } } @@ -268,13 +271,13 @@ MoofParser::ParseEncrypted(Box& aBox) } } -Moof::Moof(Box& aBox, Trex& aTrex, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf, bool aIsAudio) +Moof::Moof(Box& aBox, Trex& aTrex, Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf, bool aIsAudio) : mRange(aBox.Range()) , mMaxRoundingError(35000) { for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) { if (box.IsType("traf")) { - ParseTraf(box, aTrex, aMdhd, aEdts, aSinf, aIsAudio); + ParseTraf(box, aTrex, aMvhd, aMdhd, aEdts, aSinf, aIsAudio); } } if (IsValid()) { @@ -347,7 +350,7 @@ Moof::ProcessCenc() } void -Moof::ParseTraf(Box& aBox, Trex& aTrex, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf, bool aIsAudio) +Moof::ParseTraf(Box& aBox, Trex& aTrex, Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf, bool aIsAudio) { Tfhd tfhd(aTrex); Tfdt tfdt; @@ -375,7 +378,7 @@ Moof::ParseTraf(Box& aBox, Trex& aTrex, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf, b uint64_t decodeTime = tfdt.mBaseMediaDecodeTime; for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) { if (box.IsType("trun")) { - if (ParseTrun(box, tfhd, aMdhd, aEdts, &decodeTime, aIsAudio)) { + if (ParseTrun(box, tfhd, aMvhd, aMdhd, aEdts, &decodeTime, aIsAudio)) { mValid = true; } else { mValid = false; @@ -408,11 +411,12 @@ public: }; bool -Moof::ParseTrun(Box& aBox, Tfhd& aTfhd, Mdhd& aMdhd, Edts& aEdts, uint64_t* aDecodeTime, bool aIsAudio) +Moof::ParseTrun(Box& aBox, Tfhd& aTfhd, Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, uint64_t* aDecodeTime, bool aIsAudio) { - if (!aTfhd.IsValid() || !aMdhd.IsValid() || !aEdts.IsValid()) { - LOG(Moof, "Invalid dependencies: aTfhd(%d) aMdhd(%d) aEdts(%d)", - aTfhd.IsValid(), aMdhd.IsValid(), !aEdts.IsValid()); + if (!aTfhd.IsValid() || !aMvhd.IsValid() || !aMdhd.IsValid() || + !aEdts.IsValid()) { + LOG(Moof, "Invalid dependencies: aTfhd(%d) aMvhd(%d) aMdhd(%d) aEdts(%d)", + aTfhd.IsValid(), aMvhd.IsValid(), aMdhd.IsValid(), !aEdts.IsValid()); return false; } @@ -484,8 +488,8 @@ Moof::ParseTrun(Box& aBox, Tfhd& aTfhd, Mdhd& aMdhd, Edts& aEdts, uint64_t* aDec sample.mDecodeTime = aMdhd.ToMicroseconds(decodeTime); sample.mCompositionRange = Interval( - aMdhd.ToMicroseconds((int64_t)decodeTime + ctsOffset - aEdts.mMediaStart), - aMdhd.ToMicroseconds((int64_t)decodeTime + ctsOffset + sampleDuration - aEdts.mMediaStart)); + aMdhd.ToMicroseconds((int64_t)decodeTime + ctsOffset - aEdts.mMediaStart) + aMvhd.ToMicroseconds(aEdts.mEmptyOffset), + aMdhd.ToMicroseconds((int64_t)decodeTime + ctsOffset + sampleDuration - aEdts.mMediaStart) + aMvhd.ToMicroseconds(aEdts.mEmptyOffset)); decodeTime += sampleDuration; // Sometimes audio streams don't properly mark their samples as keyframes, @@ -551,7 +555,7 @@ Tkhd::Tkhd(Box& aBox) mValid = true; } -Mdhd::Mdhd(Box& aBox) +Mvhd::Mvhd(Box& aBox) { BoxReader reader(aBox); if (!reader->CanReadType()) { @@ -561,9 +565,9 @@ Mdhd::Mdhd(Box& aBox) uint32_t flags = reader->ReadU32(); uint8_t version = flags >> 24; size_t need = - 3*(version ? sizeof(int64_t) : sizeof(int32_t)) + 2*sizeof(uint32_t); + 3*(version ? sizeof(int64_t) : sizeof(int32_t)) + sizeof(uint32_t); if (reader->Remaining() < need) { - LOG(Mdhd, "Incomplete Box (have:%lld need:%lld)", + LOG(Mvhd, "Incomplete Box (have:%lld need:%lld)", (uint64_t)reader->Remaining(), (uint64_t)need); return; } @@ -578,12 +582,18 @@ Mdhd::Mdhd(Box& aBox) mModificationTime = reader->ReadU64(); mTimescale = reader->ReadU32(); mDuration = reader->ReadU64(); + } else { + reader->DiscardRemaining(); + return; } - // language and pre_defined=0 - reader->ReadU32(); - if (mTimescale) { - mValid = true; - } + // More stuff that we don't care about + reader->DiscardRemaining(); + mValid = true; +} + +Mdhd::Mdhd(Box& aBox) + : Mvhd(aBox) +{ } Trex::Trex(Box& aBox) @@ -672,6 +682,7 @@ Tfdt::Tfdt(Box& aBox) Edts::Edts(Box& aBox) : mMediaStart(0) + , mEmptyOffset(0) { Box child = aBox.FirstChild(); if (!child.IsType("elst")) { @@ -692,22 +703,31 @@ Edts::Edts(Box& aBox) (uint64_t)reader->Remaining(), (uint64_t)need); return; } + bool emptyEntry = false; uint32_t entryCount = reader->ReadU32(); - NS_ASSERTION(entryCount == 1, "Can't handle videos with multiple edits"); - if (entryCount != 1) { - reader->DiscardRemaining(); - return; + for (uint32_t i = 0; i < entryCount; i++) { + uint64_t segment_duration; + int64_t media_time; + if (version == 1) { + segment_duration = reader->ReadU64(); + media_time = reader->Read64(); + } else { + segment_duration = reader->ReadU32(); + media_time = reader->Read32(); + } + if (media_time == -1 && i) { + LOG(Edts, "Multiple empty edit, not handled"); + } else if (media_time == -1) { + mEmptyOffset = segment_duration; + emptyEntry = true; + } else if (i > 1 || (i > 0 && !emptyEntry)) { + LOG(Edts, "More than one edit entry, not handled. A/V sync will be wrong"); + break; + } else { + mMediaStart = media_time; + } + reader->ReadU32(); // media_rate_integer and media_rate_fraction } - - uint64_t segment_duration; - if (version == 1) { - segment_duration = reader->ReadU64(); - mMediaStart = reader->Read64(); - } else { - segment_duration = reader->ReadU32(); - mMediaStart = reader->Read32(); - } - reader->DiscardRemaining(); } Saiz::Saiz(Box& aBox, AtomType aDefaultType) diff --git a/media/libstagefright/binding/include/mp4_demuxer/MoofParser.h b/media/libstagefright/binding/include/mp4_demuxer/MoofParser.h index 56ef94f95331..a87703560cba 100644 --- a/media/libstagefright/binding/include/mp4_demuxer/MoofParser.h +++ b/media/libstagefright/binding/include/mp4_demuxer/MoofParser.h @@ -18,35 +18,17 @@ class Box; class BoxContext; class Moof; -class Tkhd : public Atom +class Mvhd : public Atom { public: - Tkhd() - : mCreationTime(0) - , mModificationTime(0) - , mTrackId(0) - , mDuration(0) - { - } - explicit Tkhd(Box& aBox); - - uint64_t mCreationTime; - uint64_t mModificationTime; - uint32_t mTrackId; - uint64_t mDuration; -}; - -class Mdhd : public Atom -{ -public: - Mdhd() + Mvhd() : mCreationTime(0) , mModificationTime(0) , mTimescale(0) , mDuration(0) { } - explicit Mdhd(Box& aBox); + explicit Mvhd(Box& aBox); Microseconds ToMicroseconds(int64_t aTimescaleUnits) { @@ -59,6 +41,25 @@ public: uint64_t mDuration; }; +class Tkhd : public Mvhd +{ +public: + Tkhd() + : mTrackId(0) + { + } + explicit Tkhd(Box& aBox); + + uint32_t mTrackId; +}; + +class Mdhd : public Mvhd +{ +public: + Mdhd() = default; + explicit Mdhd(Box& aBox); +}; + class Trex : public Atom { public: @@ -113,6 +114,7 @@ class Edts : public Atom public: Edts() : mMediaStart(0) + , mEmptyOffset(0) { } explicit Edts(Box& aBox); @@ -123,6 +125,7 @@ public: } int64_t mMediaStart; + int64_t mEmptyOffset; }; struct Sample @@ -168,7 +171,7 @@ private: class Moof : public Atom { public: - Moof(Box& aBox, Trex& aTrex, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf, bool aIsAudio); + Moof(Box& aBox, Trex& aTrex, Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf, bool aIsAudio); bool GetAuxInfo(AtomType aType, nsTArray* aByteRanges); void FixRounding(const Moof& aMoof); @@ -181,9 +184,9 @@ public: nsTArray mSaios; private: - void ParseTraf(Box& aBox, Trex& aTrex, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf, bool aIsAudio); + void ParseTraf(Box& aBox, Trex& aTrex, Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf, bool aIsAudio); // aDecodeTime is updated to the end of the parsed TRUN on return. - bool ParseTrun(Box& aBox, Tfhd& aTfhd, Mdhd& aMdhd, Edts& aEdts, uint64_t* aDecodeTime, bool aIsAudio); + bool ParseTrun(Box& aBox, Tfhd& aTfhd, Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, uint64_t* aDecodeTime, bool aIsAudio); void ParseSaiz(Box& aBox); void ParseSaio(Box& aBox); bool ProcessCenc(); @@ -227,6 +230,7 @@ public: nsRefPtr mSource; uint64_t mOffset; nsTArray mMoofOffsets; + Mvhd mMvhd; Mdhd mMdhd; Trex mTrex; Tfdt mTfdt;