gecko-dev/dom/media/MP3FrameParser.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include <algorithm>

#include "nsMemory.h"
#include "MP3FrameParser.h"
#include "VideoUtils.h"


#define FROM_BIG_ENDIAN(X) ((uint32_t)((uint8_t)(X)[0] << 24 | (uint8_t)(X)[1] << 16 | \
                                       (uint8_t)(X)[2] << 8 | (uint8_t)(X)[3]))


namespace mozilla {

/*
 * Following code taken from http://www.hydrogenaudio.org/forums/index.php?showtopic=85125
 * with permission from the author, Nick Wallette <sirnickity@gmail.com>.
 */

/* BEGIN shameless copy and paste */

// Bitrates - use [version][layer][bitrate]
const uint16_t mpeg_bitrates[4][4][16] = {
  { // Version 2.5
    { 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 0 }, // Reserved
    { 0,   8,  16,  24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160, 0 }, // Layer 3
    { 0,   8,  16,  24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160, 0 }, // Layer 2
    { 0,  32,  48,  56,  64,  80,  96, 112, 128, 144, 160, 176, 192, 224, 256, 0 }  // Layer 1
  },
  { // Reserved
    { 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 0 }, // Invalid
    { 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 0 }, // Invalid
    { 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 0 }, // Invalid
    { 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 0 }  // Invalid
  },
  { // Version 2
    { 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 0 }, // Reserved
    { 0,   8,  16,  24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160, 0 }, // Layer 3
    { 0,   8,  16,  24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160, 0 }, // Layer 2
    { 0,  32,  48,  56,  64,  80,  96, 112, 128, 144, 160, 176, 192, 224, 256, 0 }  // Layer 1
  },
  { // Version 1
    { 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 0 }, // Reserved
    { 0,  32,  40,  48,  56,  64,  80,  96, 112, 128, 160, 192, 224, 256, 320, 0 }, // Layer 3
    { 0,  32,  48,  56,  64,  80,  96, 112, 128, 160, 192, 224, 256, 320, 384, 0 }, // Layer 2
    { 0,  32,  64,  96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 0 }, // Layer 1
  }
};

// Sample rates - use [version][srate]
const uint16_t mpeg_srates[4][4] = {
    { 11025, 12000,  8000, 0 }, // MPEG 2.5
    {     0,     0,     0, 0 }, // Reserved
    { 22050, 24000, 16000, 0 }, // MPEG 2
    { 44100, 48000, 32000, 0 }  // MPEG 1
};

// Samples per frame - use [version][layer]
const uint16_t mpeg_frame_samples[4][4] = {
//    Rsvd     3     2     1  < Layer  v Version
    {    0,  576, 1152,  384 }, //       2.5
    {    0,    0,    0,    0 }, //       Reserved
    {    0,  576, 1152,  384 }, //       2
    {    0, 1152, 1152,  384 }  //       1
};

// Slot size (MPEG unit of measurement) - use [layer]
const uint8_t mpeg_slot_size[4] = { 0, 1, 1, 4 }; // Rsvd, 3, 2, 1

uint16_t
MP3Frame::CalculateLength()
{
  // Lookup real values of these fields
  uint32_t  bitrate   = mpeg_bitrates[mVersion][mLayer][mBitrate] * 1000;
  uint32_t  samprate  = mpeg_srates[mVersion][mSampleRate];
  uint16_t  samples   = mpeg_frame_samples[mVersion][mLayer];
  uint8_t   slot_size = mpeg_slot_size[mLayer];

  // In-between calculations
  float     bps       = (float)samples / 8.0;
  float     fsize     = ( (bps * (float)bitrate) / (float)samprate )
    + ( (mPad) ? slot_size : 0 );

  // Frame sizes are truncated integers
  return (uint16_t)fsize;
}

/* END shameless copy and paste */


/** MP3Parser methods **/

MP3Parser::MP3Parser()
  : mCurrentChar(0)
{ }

void
MP3Parser::Reset()
{
  mCurrentChar = 0;
}

uint16_t
MP3Parser::ParseFrameLength(uint8_t ch)
{
  mData.mRaw[mCurrentChar] = ch;

  MP3Frame &frame = mData.mFrame;

  // Validate MP3 header as we read. We can't mistake the start of an MP3 frame
  // for the middle of another frame due to the sync byte at the beginning
  // of the frame.

  // The only valid position for an all-high byte is the sync byte at the
  // beginning of the frame.
  if (ch == 0xff) {
    mCurrentChar = 0;
  }

  // Make sure the current byte is valid in context. If not, reset the parser.
  if (mCurrentChar == 2) {
    if (frame.mBitrate == 0x0f) {
      goto fail;
    }
  } else if (mCurrentChar == 1) {
    if (frame.mSync2 != 0x07
        || frame.mVersion == 0x01
        || frame.mLayer == 0x00) {
      goto fail;
    }
  }

  // The only valid character at the beginning of the header is 0xff. Fail if
  // it's different.
  if (mCurrentChar == 0 && frame.mSync1 != 0xff) {
    // Couldn't find the sync byte. Fail.
    return 0;
  }

  mCurrentChar++;
  MOZ_ASSERT(mCurrentChar <= sizeof(MP3Frame));

  // Don't have a full header yet.
  if (mCurrentChar < sizeof(MP3Frame)) {
    return 0;
  }

  // Woo, valid header. Return the length.
  mCurrentChar = 0;
  return frame.CalculateLength();

fail:
  Reset();
  return 0;
}

uint32_t
MP3Parser::GetSampleRate()
{
  MP3Frame &frame = mData.mFrame;
  return mpeg_srates[frame.mVersion][frame.mSampleRate];
}

uint32_t
MP3Parser::GetSamplesPerFrame()
{
  MP3Frame &frame = mData.mFrame;
  return mpeg_frame_samples[frame.mVersion][frame.mLayer];
}


/** ID3Parser methods **/

const char sID3Head[3] = { 'I', 'D', '3' };
const uint32_t ID3_HEADER_LENGTH = 10;
const uint32_t ID3_FOOTER_LENGTH = 10;
const uint8_t ID3_FOOTER_PRESENT = 0x10;

ID3Parser::ID3Parser()
  : mCurrentChar(0)
  , mVersion(0)
  , mFlags(0)
  , mHeaderLength(0)
{ }

void
ID3Parser::Reset()
{
  mCurrentChar = mVersion = mFlags = mHeaderLength = 0;
}

bool
ID3Parser::ParseChar(char ch)
{
  switch (mCurrentChar) {
    // The first three bytes of an ID3v2 header must match the string "ID3".
    case 0: case 1: case 2:
      if (ch != sID3Head[mCurrentChar]) {
        goto fail;
      }
      break;
    // The fourth and fifth bytes give the version, between 2 and 4.
    case 3:
      if (ch < '\2' || ch > '\4') {
        goto fail;
      }
      mVersion = uint8_t(ch);
      break;
    case 4:
      if (ch != '\0') {
        goto fail;
      }
      break;
    // The sixth byte gives the flags; valid flags depend on the version.
    case 5:
      if ((ch & (0xff >> mVersion)) != '\0') {
        goto fail;
      }
      mFlags = uint8_t(ch);
      break;
    // Bytes seven through ten give the sum of the byte length of the extended
    // header, the padding and the frames after unsynchronisation.
    // These bytes form a 28-bit integer, with the high bit of each byte unset.
    case 6: case 7: case 8: case 9:
      if (ch & 0x80) {
        goto fail;
      }
      mHeaderLength <<= 7;
      mHeaderLength |= ch;
      if (mCurrentChar == 9) {
        mHeaderLength += ID3_HEADER_LENGTH;
        mHeaderLength += (mFlags & ID3_FOOTER_PRESENT) ? ID3_FOOTER_LENGTH : 0;
      }
      break;
    default:
      MOZ_CRASH("Header already fully parsed!");
  }

  mCurrentChar++;

  return IsParsed();

fail:
  if (mCurrentChar) {
    Reset();
    return ParseChar(ch);
  }
  Reset();
  return false;
}

bool
ID3Parser::IsParsed() const
{
  return mCurrentChar >= ID3_HEADER_LENGTH;
}

uint32_t
ID3Parser::GetHeaderLength() const
{
  MOZ_ASSERT(IsParsed(),
             "Queried length of ID3 header before parsing finished.");
  return mHeaderLength;
}


/** VBR header helper stuff **/

// Helper function to find a VBR header in an MP3 frame.
// Based on information from
// http://www.codeproject.com/Articles/8295/MPEG-Audio-Frame-Header

const uint32_t VBRI_TAG = FROM_BIG_ENDIAN("VBRI");
const uint32_t VBRI_OFFSET = 32 - sizeof(MP3Frame);
const uint32_t VBRI_FRAME_COUNT_OFFSET = VBRI_OFFSET + 14;
const uint32_t VBRI_MIN_FRAME_SIZE = VBRI_OFFSET + 26;

const uint32_t XING_TAG = FROM_BIG_ENDIAN("Xing");
enum XingFlags {
  XING_HAS_NUM_FRAMES = 0x01,
  XING_HAS_NUM_BYTES = 0x02,
  XING_HAS_TOC = 0x04,
  XING_HAS_VBR_SCALE = 0x08
};

static int64_t
ParseXing(const char *aBuffer)
{
  uint32_t flags = FROM_BIG_ENDIAN(aBuffer + 4);

  if (!(flags & XING_HAS_NUM_FRAMES)) {
    NS_WARNING("VBR file without frame count. Duration estimation likely to "
               "be totally wrong.");
    return -1;
  }

  int64_t numFrames = -1;
  if (flags & XING_HAS_NUM_FRAMES) {
    numFrames = FROM_BIG_ENDIAN(aBuffer + 8);
  }

  return numFrames;
}

static int64_t
FindNumVBRFrames(const nsAutoCString& aFrame)
{
  const char *buffer = aFrame.get();
  const char *bufferEnd = aFrame.get() + aFrame.Length();

  // VBRI header is nice and well-defined; let's try to find that first.
  if (aFrame.Length() > VBRI_MIN_FRAME_SIZE &&
      FROM_BIG_ENDIAN(buffer + VBRI_OFFSET) == VBRI_TAG) {
    return FROM_BIG_ENDIAN(buffer + VBRI_FRAME_COUNT_OFFSET);
  }

  // We have to search for the Xing header as its position can change.
  for (; buffer + sizeof(XING_TAG) < bufferEnd; buffer++) {
    if (FROM_BIG_ENDIAN(buffer) == XING_TAG) {
      return ParseXing(buffer);
    }
  }

  return -1;
}


/** MP3FrameParser methods **/

// Some MP3's have large ID3v2 tags, up to 150KB, so we allow lots of
// skipped bytes to be read, just in case, before we give up and assume
// we're not parsing an MP3 stream.
static const uint32_t MAX_SKIPPED_BYTES = 4096;

enum {
  MP3_HEADER_LENGTH   = 4,
};

MP3FrameParser::MP3FrameParser(int64_t aLength)
: mLock("MP3FrameParser.mLock"),
  mTotalID3Size(0),
  mTotalFrameSize(0),
  mFrameCount(0),
  mOffset(0),
  mLength(aLength),
  mMP3Offset(-1),
  mSamplesPerSecond(0),
  mFirstFrameEnd(-1),
  mIsMP3(MAYBE_MP3)
{ }

nsresult MP3FrameParser::ParseBuffer(const uint8_t* aBuffer,
                                     uint32_t aLength,
                                     int64_t aStreamOffset,
                                     uint32_t* aOutBytesRead)
{
  // Iterate forwards over the buffer, looking for ID3 tag, or MP3
  // Frame headers.
  const uint8_t *buffer = aBuffer;
  const uint8_t *bufferEnd = aBuffer + aLength;

  // If we haven't found any MP3 frame data yet, there might be ID3 headers
  // we can skip over.
  if (mMP3Offset < 0) {
    for (const uint8_t *ch = buffer; ch < bufferEnd; ch++) {
      if (mID3Parser.ParseChar(*ch)) {
        // Found an ID3 header. We don't care about the body of the header, so
        // just skip past.
        buffer = ch + mID3Parser.GetHeaderLength() - (ID3_HEADER_LENGTH - 1);

        if (buffer <= ch) {
          return NS_ERROR_FAILURE;
        }

        ch = buffer;

        mTotalID3Size += mID3Parser.GetHeaderLength();

        // Yes, this is an MP3!
        mIsMP3 = DEFINITELY_MP3;

        mID3Parser.Reset();
      }
    }
  }

  // The first MP3 frame in a variable bitrate stream can contain metadata
  // for duration estimation and seeking, so we buffer that first frame here.
  if (aStreamOffset < mFirstFrameEnd) {
    uint64_t copyLen = std::min((int64_t)aLength, mFirstFrameEnd - aStreamOffset);
    mFirstFrame.Append((const char *)buffer, copyLen);
    buffer += copyLen;
  }

  while (buffer < bufferEnd) {
    uint16_t frameLen = mMP3Parser.ParseFrameLength(*buffer);

    if (frameLen) {
      // We've found an MP3 frame!
      // This is the first frame (and the only one we'll bother parsing), so:
      // * Mark this stream as MP3;
      // * Store the offset at which the MP3 data started; and
      // * Start buffering the frame, as it might contain handy metadata.

      // We're now sure this is an MP3 stream.
      mIsMP3 = DEFINITELY_MP3;

      // We need to know these to convert the number of frames in the stream
      // to the length of the stream in seconds.
      mSamplesPerSecond = mMP3Parser.GetSampleRate();
      mSamplesPerFrame = mMP3Parser.GetSamplesPerFrame();

      // If the stream has a constant bitrate, we should only need the length
      // of the first frame and the length (in bytes) of the stream to
      // estimate the length (in seconds).
      mTotalFrameSize += frameLen;
      mFrameCount++;

      // If |mMP3Offset| isn't set then this is the first MP3 frame we have
      // seen in the stream, which is useful for duration estimation.
      if (mMP3Offset > -1) {
        uint16_t skip = frameLen - sizeof(MP3Frame);
        buffer += skip ? skip : 1;
        continue;
      }

      // Remember the offset of the MP3 stream.
      // We're at the last byte of an MP3Frame, so MP3 data started
      // sizeof(MP3Frame) - 1 bytes ago.
      mMP3Offset = aStreamOffset
        + (buffer - aBuffer)
        - (sizeof(MP3Frame) - 1);

      buffer++;

      // If the stream has a variable bitrate, the first frame has metadata
      // we need for duration estimation and seeking. Start buffering it so we
      // can parse it later.
      mFirstFrameEnd = mMP3Offset + frameLen;
      uint64_t currOffset = buffer - aBuffer + aStreamOffset;
      uint64_t copyLen = std::min(mFirstFrameEnd - currOffset,
                                  (uint64_t)(bufferEnd - buffer));
      mFirstFrame.Append((const char *)buffer, copyLen);

      buffer += copyLen;

    } else {
      // Nothing to see here. Move along.
      buffer++;
    }
  }

  *aOutBytesRead = buffer - aBuffer;

  if (mFirstFrameEnd > -1 && mFirstFrameEnd <= aStreamOffset + buffer - aBuffer) {
    // We have our whole first frame. Try to find a VBR header.
    mNumFrames = FindNumVBRFrames(mFirstFrame);
    mFirstFrameEnd = -1;
  }

  return NS_OK;
}

void MP3FrameParser::Parse(const uint8_t* aBuffer, uint32_t aLength, uint64_t aOffset)
{
  MutexAutoLock mon(mLock);

  if (HasExactDuration()) {
    // We know the duration; nothing to do here.
    return;
  }

  const uint8_t* buffer = aBuffer;
  int32_t length = aLength;
  uint64_t offset = aOffset;

  // Got some data we have seen already. Skip forward to what we need.
  if (aOffset < mOffset) {
    buffer += mOffset - aOffset;
    length -= mOffset - aOffset;
    offset = mOffset;

    if (length <= 0) {
      return;
    }
  }

  // If there is a discontinuity in the input stream, reset the state of the
  // parsers so we don't get any partial headers.
  if (mOffset < aOffset) {
    if (!mID3Parser.IsParsed()) {
      // Only reset this if it hasn't finished yet.
      mID3Parser.Reset();
    }

    if (mFirstFrameEnd > -1) {
      NS_WARNING("Discontinuity in input while buffering first frame.");
      mFirstFrameEnd = -1;
    }

    mMP3Parser.Reset();
  }

  uint32_t bytesRead = 0;
  if (NS_FAILED(ParseBuffer(buffer,
                            length,
                            offset,
                            &bytesRead))) {
    return;
  }

  MOZ_ASSERT(length <= (int)bytesRead, "All bytes should have been consumed");

  // Update next data offset
  mOffset = offset + bytesRead;

  // If we've parsed lots of data and we still have nothing, just give up.
  // We don't count ID3 headers towards the skipped bytes count, as MP3 files
  // can have massive ID3 sections.
  if (!mID3Parser.IsParsed() && mMP3Offset < 0 &&
      mOffset - mTotalID3Size > MAX_SKIPPED_BYTES) {
    mIsMP3 = NOT_MP3;
  }
}

int64_t MP3FrameParser::GetDuration()
{
  MutexAutoLock mon(mLock);

  if (!ParsedHeaders() || !mSamplesPerSecond) {
    // Not a single frame decoded yet.
    return -1;
  }

  MOZ_ASSERT(mFrameCount > 0 && mTotalFrameSize > 0,
             "Frame parser should have seen at least one MP3 frame of positive length.");

  if (!mFrameCount || !mTotalFrameSize) {
    // This should never happen.
    return -1;
  }

  double frames;
  if (mNumFrames < 0) {
    // Estimate the number of frames in the stream based on the average frame
    // size and the length of the MP3 file.
    double frameSize = (double)mTotalFrameSize / mFrameCount;
    frames = (double)(mLength - mMP3Offset) / frameSize;
  } else {
    // We know the exact number of frames from the VBR header.
    frames = mNumFrames;
  }

  // The duration of each frame is constant over a given stream.
  double usPerFrame = USECS_PER_S * mSamplesPerFrame / mSamplesPerSecond;

  return frames * usPerFrame;
}

int64_t MP3FrameParser::GetMP3Offset()
{
  MutexAutoLock mon(mLock);
  return mMP3Offset;
}

bool MP3FrameParser::ParsedHeaders()
{
  // We have seen both the beginning and the end of the first MP3 frame in the
  // stream.
  return mMP3Offset > -1 && mFirstFrameEnd < 0;
}

bool MP3FrameParser::HasExactDuration()
{
  return ParsedHeaders() && mNumFrames > -1;
}

bool MP3FrameParser::NeedsData()
{
  // If we don't know the duration exactly then either:
  //  - we're still waiting for a VBR header; or
  //  - we look at all frames to constantly update our duration estimate.
  return IsMP3() && !HasExactDuration();
}

}