Bug 1823953 - Loop over avcodec_receive_frame when decoding using FFmpegAudioDecoder. r=alwu,media-playback-reviewers

It seems to work without it, but the official ffmpeg command-line utility and the examples for their internal library all do this. This also splits the big DoDecode function into three sub-functions for clarity, as it was a bit hard to follow with the #if statements. Differential Revision: https://phabricator.services.mozilla.com/D173460
2025-01-06 00:10:25 +00:00 · 2023-08-08 12:12:40 +00:00 · 2023-08-08 12:12:40 +00:00 · 0211d2ff14
commit 0211d2ff14
parent cb94ba55d5
5 changed files with 177 additions and 156 deletions
--- a/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.cpp
+++ b/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.cpp
@ -29,32 +29,13 @@ FFmpegAudioDecoder<LIBAV_VER>::FFmpegAudioDecoder(FFmpegLibWrapper* aLib,
    // Ffmpeg expects the DecoderConfigDescriptor blob.
    mExtraData->AppendElements(
        *aacCodecSpecificData.mDecoderConfigDescriptorBinaryBlob);
-    mEncoderDelay = aacCodecSpecificData.mEncoderDelayFrames;
-    mEncoderPaddingOrTotalFrames = aacCodecSpecificData.mMediaFrameCount;
-    FFMPEG_LOG("FFmpegAudioDecoder (aac), found encoder delay (%" PRIu32
-               ") and total frame count (%" PRIu64
-               ") in codec-specific side data",
-               mEncoderDelay, TotalFrames());
+    FFMPEG_LOG("FFmpegAudioDecoder ctor (aac)");
    return;
  }

  if (mCodecID == AV_CODEC_ID_MP3) {
-    // Downgraded from diagnostic assert due to BMO 1776524 on Android.
-    MOZ_ASSERT(aConfig.mCodecSpecificConfig.is<Mp3CodecSpecificData>());
-    // Gracefully handle bad data. If don't hit the preceding assert once this
-    // has been shipped for awhile, we can remove it and make the following code
-    // non-conditional.
-    if (aConfig.mCodecSpecificConfig.is<Mp3CodecSpecificData>()) {
-      const Mp3CodecSpecificData& mp3CodecSpecificData =
-          aConfig.mCodecSpecificConfig.as<Mp3CodecSpecificData>();
-      mEncoderDelay = mp3CodecSpecificData.mEncoderDelayFrames;
-      mEncoderPaddingOrTotalFrames = mp3CodecSpecificData.mEncoderPaddingFrames;
-      FFMPEG_LOG("FFmpegAudioDecoder (mp3), found encoder delay (%" PRIu32
-                 ")"
-                 "and padding values (%" PRIu64 ") in codec-specific side-data",
-                 mEncoderDelay, Padding());
-      return;
-    }
+    // Nothing to do
+    return;
  }

  if (mCodecID == AV_CODEC_ID_FLAC) {
@ -108,11 +89,21 @@ void FFmpegAudioDecoder<LIBAV_VER>::InitCodecContext() {
  // FFmpeg takes this as a suggestion for what format to use for audio samples.
  // LibAV 0.8 produces rubbish float interleaved samples, request 16 bits
  // audio.
-#ifdef MOZ_SAMPLE_TYPE_S16
-  mCodecContext->request_sample_fmt = AV_SAMPLE_FMT_S16;
-#else
  mCodecContext->request_sample_fmt =
      (mLib->mVersion == 53) ? AV_SAMPLE_FMT_S16 : AV_SAMPLE_FMT_FLT;
+#ifdef FFVPX_VERSION
+  // AudioInfo's layout first 32-bits are bit-per-bit compatible with
+  // WAVEFORMATEXTENSIBLE and FFmpeg's AVChannel enum. We can cast here.
+  mCodecContext->ch_layout.nb_channels =
+      AssertedCast<int>(mAudioInfo.mChannels);
+  if (mAudioInfo.mChannelMap != AudioConfig::ChannelLayout::UNKNOWN_MAP) {
+    mLib->av_channel_layout_from_mask(
+        &mCodecContext->ch_layout, AssertedCast<uint64_t>(mAudioInfo.mChannelMap));
+  } else {
+    mLib->av_channel_layout_default(&mCodecContext->ch_layout,
+                                    AssertedCast<int>(mAudioInfo.mChannels));
+  }
+  mCodecContext->sample_rate = AssertedCast<int>(mAudioInfo.mRate);
 #endif
 }

@ -239,14 +230,147 @@ static AlignedAudioBuffer CopyAndPackAudio(AVFrame* aFrame,

 using ChannelLayout = AudioConfig::ChannelLayout;

-uint64_t FFmpegAudioDecoder<LIBAV_VER>::Padding() const {
-  MOZ_ASSERT(mCodecID == AV_CODEC_ID_MP3);
-  return mEncoderPaddingOrTotalFrames;
+MediaResult FFmpegAudioDecoder<LIBAV_VER>::PostProcessOutput(
+    bool aDecoded, MediaRawData* aSample, DecodedData& aResults,
+    bool* aGotFrame, size_t aSamplePositionOffset) {
+  int64_t samplePosition =
+      AssertedCast<int64_t>(aSample->mOffset + aSamplePositionOffset);
+  media::TimeUnit pts = aSample->mTime;
+
+  if (mFrame->format != AV_SAMPLE_FMT_FLT &&
+      mFrame->format != AV_SAMPLE_FMT_FLTP &&
+      mFrame->format != AV_SAMPLE_FMT_S16 &&
+      mFrame->format != AV_SAMPLE_FMT_S16P &&
+      mFrame->format != AV_SAMPLE_FMT_S32 &&
+      mFrame->format != AV_SAMPLE_FMT_S32P) {
+    return MediaResult(
+        NS_ERROR_DOM_MEDIA_DECODE_ERR,
+        RESULT_DETAIL("FFmpeg audio decoder outputs unsupported audio format"));
+  }
+
+  FFMPEG_LOG(
+      "FFmpegAudioDecoder decoded: %zu bytes, [%s,%s] (Duration: %s) [%s]",
+      aSamplePositionOffset, aSample->mTime.ToString().get(),
+      aSample->GetEndTime().ToString().get(),
+      aSample->mDuration.ToString().get(),
+      mLib->av_get_sample_fmt_name(mFrame->format));
+
+  uint32_t numChannels = mCodecContext->channels;
+  uint32_t samplingRate = mCodecContext->sample_rate;
+
+  AlignedAudioBuffer audio =
+      CopyAndPackAudio(mFrame, numChannels, mFrame->nb_samples);
+  if (!audio) {
+    FFMPEG_LOG("CopyAndPackAudio error (OOM)");
+    return MediaResult(NS_ERROR_OUT_OF_MEMORY, __func__);
+  }
+
+  media::TimeUnit duration = TimeUnit(mFrame->nb_samples, samplingRate);
+  if (!duration.IsValid()) {
+    FFMPEG_LOG("Duration isn't valid (%d + %d)", mFrame->nb_samples,
+               samplingRate);
+    return MediaResult(NS_ERROR_DOM_MEDIA_OVERFLOW_ERR,
+                       RESULT_DETAIL("Invalid sample duration"));
+  }
+
+  media::TimeUnit newpts = pts + duration;
+  if (!newpts.IsValid()) {
+    FFMPEG_LOG("New pts isn't valid (%lf + %lf)", pts.ToSeconds(),
+               duration.ToSeconds());
+    return MediaResult(
+        NS_ERROR_DOM_MEDIA_OVERFLOW_ERR,
+        RESULT_DETAIL("Invalid count of accumulated audio samples"));
+  }
+
+  RefPtr<AudioData> data =
+      new AudioData(samplePosition, pts, std::move(audio), numChannels,
+                    samplingRate, mCodecContext->channel_layout);
+  MOZ_ASSERT(duration == data->mDuration, "must be equal");
+  aResults.AppendElement(std::move(data));
+
+  pts = newpts;
+
+  if (aGotFrame) {
+    *aGotFrame = true;
+  }
+  return NS_OK;
 }
-uint64_t FFmpegAudioDecoder<LIBAV_VER>::TotalFrames() const {
-  MOZ_ASSERT(mCodecID == AV_CODEC_ID_AAC);
-  return mEncoderPaddingOrTotalFrames;
+
+#if LIBAVCODEC_VERSION_MAJOR < 59
+MediaResult FFmpegAudioDecoder<LIBAV_VER>::DecodeUsingFFmpeg(
+    AVPacket* aPacket, int& aOutBytesConsumed, bool& aDecoded,
+    MediaRawData* aSample, DecodedData& aResults, bool* aGotFrame) {
+  int decoded = 0;
+  int rv =
+      mLib->avcodec_decode_audio4(mCodecContext, mFrame, &decoded, aPacket);
+  aDecoded = decoded == 1;
+  if (rv < 0) {
+    NS_WARNING("FFmpeg audio decoder error.");
+    return MediaResult(
+        NS_ERROR_DOM_MEDIA_DECODE_ERR,
+        RESULT_DETAIL("FFmpeg audio error:%d", aOutBytesConsumed));
+  }
+  PostProcessOutput(decoded, aSample, aResults, aGotFrame, aOutBytesConsumed);
+  aOutBytesConsumed += rv;
+  return NS_OK;
 }
+#else
+#  define AVRESULT_OK 0
+
+MediaResult FFmpegAudioDecoder<LIBAV_VER>::DecodeUsingFFmpeg(
+    AVPacket* aPacket, int& aOutBytesConsumed, bool& aDecoded,
+    MediaRawData* aSample, DecodedData& aResults, bool* aGotFrame) {
+  int ret = mLib->avcodec_send_packet(mCodecContext, aPacket);
+  int consumed = 0;
+  switch (ret) {
+    case AVRESULT_OK:
+      consumed = aPacket->size;
+      break;
+    case AVERROR(EAGAIN):
+      FFMPEG_LOG("  av_codec_send_packet: EAGAIN.");
+      break;
+    case AVERROR_EOF:
+      FFMPEG_LOG("  End of stream.");
+      return MediaResult(NS_ERROR_DOM_MEDIA_END_OF_STREAM,
+                         RESULT_DETAIL("End of stream"));
+    default:
+      NS_WARNING("FFmpeg audio decoder error (avcodec_send_packet).");
+      return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
+                         RESULT_DETAIL("FFmpeg audio error"));
+  }
+
+  while (ret >= 0) {
+    ret = mLib->avcodec_receive_frame(mCodecContext, mFrame);
+    switch (ret) {
+      case AVRESULT_OK:
+        aDecoded = true;
+        aOutBytesConsumed += consumed;
+        break;
+      case AVERROR(EAGAIN):
+        aOutBytesConsumed += consumed;
+        FFMPEG_LOG("  EAGAIN.");
+        break;
+      case AVERROR_EOF: {
+        FFMPEG_LOG("  End of stream.");
+        aOutBytesConsumed += consumed;
+        return MediaResult(NS_ERROR_DOM_MEDIA_END_OF_STREAM,
+                           RESULT_DETAIL("End of stream"));
+        default:
+          FFMPEG_LOG("  avcodec_receive_packet error.");
+          NS_WARNING("FFmpeg audio decoder error (avcodec_receive_packet).");
+          return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
+                             RESULT_DETAIL("FFmpeg audio error"));
+      }
+    }
+    if (aDecoded) {
+      PostProcessOutput(aDecoded, aSample, aResults, aGotFrame,
+                        aOutBytesConsumed);
+    }
+  }
+
+  return NS_OK;
+}
+#endif

 MediaResult FFmpegAudioDecoder<LIBAV_VER>::DoDecode(MediaRawData* aSample,
                                                    uint8_t* aData, int aSize,
@ -257,6 +381,11 @@ MediaResult FFmpegAudioDecoder<LIBAV_VER>::DoDecode(MediaRawData* aSample,
  AVPacket packet;
  mLib->av_init_packet(&packet);

+  FFMPEG_LOG("FFmpegAudioDecoder::DoDecode: %d bytes, [%s,%s] (Duration: %s)",
+             aSize, aSample->mTime.ToString().get(),
+             aSample->GetEndTime().ToString().get(),
+             aSample->mDuration.ToString().get());
+
  packet.data = const_cast<uint8_t*>(aData);
  packet.size = aSize;

@ -271,115 +400,16 @@ MediaResult FFmpegAudioDecoder<LIBAV_VER>::DoDecode(MediaRawData* aSample,
        RESULT_DETAIL("FFmpeg audio decoder failed to allocate frame"));
  }

-  int64_t samplePosition = aSample->mOffset;
-
  while (packet.size > 0) {
-    int decoded = false;
-    int bytesConsumed = -1;
-#if LIBAVCODEC_VERSION_MAJOR < 59
-    bytesConsumed =
-        mLib->avcodec_decode_audio4(mCodecContext, mFrame, &decoded, &packet);
-    if (bytesConsumed < 0) {
-      NS_WARNING("FFmpeg audio decoder error.");
-      return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
-                         RESULT_DETAIL("FFmpeg audio error:%d", bytesConsumed));
-    }
-#else
-#  define AVRESULT_OK 0
-    int ret = mLib->avcodec_send_packet(mCodecContext, &packet);
-    switch (ret) {
-      case AVRESULT_OK:
-        bytesConsumed = packet.size;
-        break;
-      case AVERROR(EAGAIN):
-        break;
-      case AVERROR_EOF:
-        FFMPEG_LOG("  End of stream.");
-        return MediaResult(NS_ERROR_DOM_MEDIA_END_OF_STREAM,
-                           RESULT_DETAIL("End of stream"));
-      default:
-        NS_WARNING("FFmpeg audio decoder error.");
-        return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
-                           RESULT_DETAIL("FFmpeg audio error"));
-    }
-
-    ret = mLib->avcodec_receive_frame(mCodecContext, mFrame);
-    switch (ret) {
-      case AVRESULT_OK:
-        decoded = true;
-        break;
-      case AVERROR(EAGAIN):
-        break;
-      case AVERROR_EOF: {
-        FFMPEG_LOG("  End of stream.");
-        return MediaResult(NS_ERROR_DOM_MEDIA_END_OF_STREAM,
-                           RESULT_DETAIL("End of stream"));
-      }
-    }
-#endif
-
-    if (decoded) {
-      if (mFrame->format != AV_SAMPLE_FMT_FLT &&
-          mFrame->format != AV_SAMPLE_FMT_FLTP &&
-          mFrame->format != AV_SAMPLE_FMT_S16 &&
-          mFrame->format != AV_SAMPLE_FMT_S16P &&
-          mFrame->format != AV_SAMPLE_FMT_S32 &&
-          mFrame->format != AV_SAMPLE_FMT_S32P) {
-        return MediaResult(
-            NS_ERROR_DOM_MEDIA_DECODE_ERR,
-            RESULT_DETAIL(
-                "FFmpeg audio decoder outputs unsupported audio format"));
-      }
-      uint32_t numChannels = mCodecContext->channels;
-      uint32_t samplingRate = mCodecContext->sample_rate;
-
-      AlignedAudioBuffer audio =
-          CopyAndPackAudio(mFrame, numChannels, mFrame->nb_samples);
-      if (!audio) {
-        FFMPEG_LOG("FFmpegAudioDecoder: OOM");
-        return MediaResult(NS_ERROR_OUT_OF_MEMORY, __func__);
-      }
-
-      FFMPEG_LOG("Packet decoded: [%s, %s] (%" PRId64 "us, %d frames)",
-                 aSample->mTime.ToString().get(),
-                 aSample->GetEndTime().ToString().get(),
-                 aSample->mDuration.ToMicroseconds(), mFrame->nb_samples);
-
-      media::TimeUnit duration = TimeUnit(mFrame->nb_samples, samplingRate);
-      if (!duration.IsValid()) {
-        FFMPEG_LOG("FFmpegAudioDecoder: invalid duration");
-        return MediaResult(NS_ERROR_DOM_MEDIA_OVERFLOW_ERR,
-                           RESULT_DETAIL("Invalid sample duration"));
-      }
-
-      media::TimeUnit pts = aSample->mTime;
-      media::TimeUnit newpts = pts + duration;
-      if (!newpts.IsValid()) {
-        FFMPEG_LOG("FFmpegAudioDecoder: invalid PTS.");
-        return MediaResult(
-            NS_ERROR_DOM_MEDIA_OVERFLOW_ERR,
-            RESULT_DETAIL("Invalid count of accumulated audio samples"));
-      }
-
-      RefPtr<AudioData> data =
-          new AudioData(samplePosition, pts, std::move(audio), numChannels,
-                        samplingRate, mCodecContext->channel_layout);
-      MOZ_ASSERT(duration == data->mDuration, "must be equal");
-      aResults.AppendElement(std::move(data));
-
-      pts = newpts;
-
-      if (aGotFrame) {
-        *aGotFrame = true;
-      }
-    }
-    // The packet wasn't sent to ffmpeg, another attempt will happen next
+    bool decoded = false;
+    int bytesConsumed = 0;
+    auto rv = DecodeUsingFFmpeg(&packet, bytesConsumed, decoded, aSample,
+                                aResults, aGotFrame);
+    NS_ENSURE_SUCCESS(rv, rv);
+    // If the packet wasn't sent to ffmpeg, another attempt will happen next
    // iteration.
-    if (bytesConsumed != -1) {
-      packet.data += bytesConsumed;
-      packet.size -= bytesConsumed;
-      samplePosition += bytesConsumed;
-    }
+    packet.data += bytesConsumed;
+    packet.size -= bytesConsumed;
  }
  return NS_OK;
 }
--- a/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.h
+++ b/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.h
@ -43,21 +43,12 @@ class FFmpegAudioDecoder<LIBAV_VER>
 private:
  MediaResult DoDecode(MediaRawData* aSample, uint8_t* aData, int aSize,
                       bool* aGotFrame, DecodedData& aResults) override;
-  // This method is to be called only when decoding mp3, in order to correctly
-  // discard padding frames.
-  uint64_t Padding() const;
-  // This method is to be called only when decoding AAC, in order to correctly
-  // discard padding frames, based on the number of frames decoded and the total
-  // frame count of the media.
-  uint64_t TotalFrames() const;
-  // The number of frames of encoder delay, that need to be discarded at the
-  // beginning of the stream.
-  uint32_t mEncoderDelay = 0;
-  // This holds either the encoder padding (when this decoder decodes mp3), or
-  // the total frame count of the media (when this decoder decodes AAC).
-  // It is best accessed via the `Padding` and `TotalFrames` methods, for
-  // clarity.
-  uint64_t mEncoderPaddingOrTotalFrames = 0;
+  MediaResult DecodeUsingFFmpeg(AVPacket* aPacket, int& aOutBytesConsumed,
+                                bool& aDecoded, MediaRawData* aSample,
+                                DecodedData& aResults, bool* aGotFrame);
+  MediaResult PostProcessOutput(bool aDecoded, MediaRawData* aSample,
+                                DecodedData& aResults, bool* aGotFrame,
+                                size_t aSamplePositionOffset);
 };

 }  // namespace mozilla
--- a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp
+++ b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp
@ -199,6 +199,7 @@ FFmpegLibWrapper::LinkResult FFmpegLibWrapper::Link() {
  AV_FUNC(avcodec_descriptor_get, AV_FUNC_53 | AV_FUNC_55 | AV_FUNC_56 |
                                      AV_FUNC_57 | AV_FUNC_58 | AV_FUNC_59 |
                                      AV_FUNC_60)
+  AV_FUNC(av_get_sample_fmt_name, AV_FUNC_AVUTIL_ALL);

 #ifdef MOZ_WIDGET_GTK
  AV_FUNC_OPTION_SILENT(avcodec_get_hw_config,
--- a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h
+++ b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h
@ -122,6 +122,7 @@ struct MOZ_ONLY_USED_TO_AVOID_STATIC_CONSTRUCTORS FFmpegLibWrapper {
                             void* log_ctx);
  int (*av_image_get_buffer_size)(int pix_fmt, int width, int height,
                                  int align);
+  const char* (*av_get_sample_fmt_name)(int sample_fmt);

  // libavutil v55 and later only
  AVFrame* (*av_frame_alloc)();
--- a/media/ffvpx/libavutil/avutil.symbols
+++ b/media/ffvpx/libavutil/avutil.symbols
@ -149,7 +149,6 @@ av_get_pix_fmt_name
 av_get_pix_fmt_string
 av_get_planar_sample_fmt
 av_get_sample_fmt
-av_get_sample_fmt_name
 av_get_sample_fmt_string
 av_get_standard_channel_layout
 av_get_time_base_q
@ -326,5 +325,4 @@ av_hwdevice_ctx_init
 av_hwframe_transfer_get_formats
 av_hwdevice_ctx_create_derived
 av_malloc_array
-
-
+av_get_sample_fmt_name