From de0e63f0ea5825f375f36d8b676d7e502d2acbc4 Mon Sep 17 00:00:00 2001 From: Andreas Pehrson Date: Wed, 31 Jul 2019 11:29:34 +0000 Subject: [PATCH] Bug 1014393 - Break out muxing and queueing of track data from MediaEncoder to new Muxer class. r=bryce This first of all does some refactoring of how metadata is encoded in MediaEncoder. This is now guided by the new Muxer class. If we're ready to pass data to the muxer and it does not have metadata yet, we provide metadata before giving it any media data. This metadata is passed to the muxer in a single call. The metadata provided in this call must stay valid for the entire recording. This removes MediaEncoder::GetEncodedMetadata(). This also removes the ctor argument from the WebMWriter since it can now rely on the single SetMetadata() instead. To comply with the ContainerWriter::SetMetadata() docs, WebMWriter::SetMetadata() will now also sanity check metadata. ContainerWriter instances are updated somewhat, to accommodate these changes. Lastly, and most important, the new Muxer class manages muxing of the (up to) two tracks into a single container, ensuring that timestamps increase monotonically throughout a recording. Differential Revision: https://phabricator.services.mozilla.com/D35306 --HG-- extra : moz-landing-system : lando --- dom/media/MediaRecorder.cpp | 54 ++--- dom/media/encoder/ContainerWriter.h | 12 +- dom/media/encoder/MediaEncoder.cpp | 349 ++++++---------------------- dom/media/encoder/MediaEncoder.h | 62 ++--- dom/media/encoder/Muxer.cpp | 228 ++++++++++++++++++ dom/media/encoder/Muxer.h | 74 ++++++ dom/media/encoder/moz.build | 1 + dom/media/gtest/TestWebMWriter.cpp | 96 ++++---- dom/media/ogg/OggWriter.cpp | 24 +- dom/media/ogg/OggWriter.h | 3 +- dom/media/webm/WebMWriter.cpp | 92 +++++--- dom/media/webm/WebMWriter.h | 12 +- 12 files changed, 548 insertions(+), 459 deletions(-) create mode 100644 dom/media/encoder/Muxer.cpp create mode 100644 dom/media/encoder/Muxer.h diff --git a/dom/media/MediaRecorder.cpp b/dom/media/MediaRecorder.cpp index a2cfe84016ca..36883f7c8a92 100644 --- a/dom/media/MediaRecorder.cpp +++ b/dom/media/MediaRecorder.cpp @@ -1039,46 +1039,32 @@ class MediaRecorder::Session : public PrincipalChangeObserver, void MediaEncoderInitialized() { MOZ_ASSERT(mEncoderThread->IsCurrentThreadIn()); - // Pull encoded metadata from MediaEncoder - nsTArray> encodedBuf; - nsString mime; - nsresult rv = mEncoder->GetEncodedMetadata(&encodedBuf, mime); - - if (NS_FAILED(rv)) { - MOZ_ASSERT(false); - return; - } - - // Append pulled data into cache buffer. - NS_DispatchToMainThread( - new StoreEncodedBufferRunnable(this, std::move(encodedBuf))); - - RefPtr self = this; - NS_DispatchToMainThread(NewRunnableFrom([self, mime]() { - if (!self->mRecorder) { - MOZ_ASSERT_UNREACHABLE("Recorder should be live"); + NS_DispatchToMainThread(NewRunnableFrom([self = RefPtr(this), this, + mime = mEncoder->MimeType()]() { + if (mRunningState.isErr()) { return NS_OK; } - if (self->mRunningState.isOk()) { - auto state = self->mRunningState.unwrap(); - if (state == RunningState::Starting || - state == RunningState::Stopping) { - if (state == RunningState::Starting) { - // We set it to Running in the runnable since we can only assign - // mRunningState on main thread. We set it before running the start - // event runnable since that dispatches synchronously (and may cause - // js calls to methods depending on mRunningState). 
- self->mRunningState = RunningState::Running; - } - self->mMimeType = mime; - self->mRecorder->SetMimeType(self->mMimeType); - auto startEvent = MakeRefPtr( - self, NS_LITERAL_STRING("start")); - startEvent->Run(); + mMimeType = mime; + mRecorder->SetMimeType(mime); + auto state = mRunningState.unwrap(); + if (state == RunningState::Starting || state == RunningState::Stopping) { + if (!self->mRecorder) { + MOZ_ASSERT_UNREACHABLE("Recorder should be live"); + return NS_OK; } + if (state == RunningState::Starting) { + // We set it to Running in the runnable since we can only assign + // mRunningState on main thread. We set it before running the start + // event runnable since that dispatches synchronously (and may cause + // js calls to methods depending on mRunningState). + mRunningState = RunningState::Running; + } + mRecorder->DispatchSimpleEvent(NS_LITERAL_STRING("start")); } return NS_OK; })); + + Extract(false, nullptr); } void MediaEncoderDataAvailable() { diff --git a/dom/media/encoder/ContainerWriter.h b/dom/media/encoder/ContainerWriter.h index b8c0a9f08893..724c8b90c961 100644 --- a/dom/media/encoder/ContainerWriter.h +++ b/dom/media/encoder/ContainerWriter.h @@ -37,13 +37,15 @@ class ContainerWriter { const nsTArray>& aData, uint32_t aFlags = 0) = 0; /** - * Set the meta data pointer into muxer - * This function will check the integrity of aMetadata. - * If the meta data isn't well format, this function will return - * NS_ERROR_FAILURE to caller, else save the pointer to mMetadata and return + * Stores the metadata for all given tracks to the muxer. + * + * This method checks the integrity of aMetadata. + * If the metadata isn't well formatted, this method returns NS_ERROR_FAILURE. + * If the metadata is well formatted, it stores the metadata and returns * NS_OK. 
*/ - virtual nsresult SetMetadata(TrackMetadataBase* aMetadata) = 0; + virtual nsresult SetMetadata( + const nsTArray>& aMetadata) = 0; /** * Indicate if the writer has finished to output data diff --git a/dom/media/encoder/MediaEncoder.cpp b/dom/media/encoder/MediaEncoder.cpp index 5f155117f8f0..e046ba092f99 100644 --- a/dom/media/encoder/MediaEncoder.cpp +++ b/dom/media/encoder/MediaEncoder.cpp @@ -25,6 +25,7 @@ #include "mozilla/StaticPtr.h" #include "mozilla/TaskQueue.h" #include "mozilla/Unused.h" +#include "Muxer.h" #include "nsIPrincipal.h" #include "nsMimeTypes.h" #include "nsThreadUtils.h" @@ -398,14 +399,13 @@ MediaEncoder::MediaEncoder(TaskQueue* aEncoderThread, VideoTrackEncoder* aVideoEncoder, TrackRate aTrackRate, const nsAString& aMIMEType) : mEncoderThread(aEncoderThread), - mWriter(std::move(aWriter)), + mMuxer(MakeUnique(std::move(aWriter))), mAudioEncoder(aAudioEncoder), mVideoEncoder(aVideoEncoder), mEncoderListener(MakeAndAddRef(mEncoderThread, this)), mStartTime(TimeStamp::Now()), mMIMEType(aMIMEType), mInitialized(false), - mMetadataEncoded(false), mCompleted(false), mError(false), mCanceled(false), @@ -651,7 +651,7 @@ already_AddRefed MediaEncoder::CreateEncoder( driftCompensator, aTrackRate, FrameDroppingMode::DISALLOW); } } - writer = MakeUnique(aTrackTypes); + writer = MakeUnique(); mimeType = NS_LITERAL_STRING(VIDEO_WEBM); } else if (MediaEncoder::IsWebMEncoderEnabled() && aMIMEType.EqualsLiteral(AUDIO_WEBM) && @@ -672,7 +672,7 @@ already_AddRefed MediaEncoder::CreateEncoder( } else { mimeType = NS_LITERAL_STRING(AUDIO_WEBM); } - writer = MakeUnique(aTrackTypes); + writer = MakeUnique(); } #endif // MOZ_WEBM_ENCODER else if (MediaDecoder::IsOggEnabled() && MediaDecoder::IsOpusEnabled() && @@ -699,7 +699,7 @@ already_AddRefed MediaEncoder::CreateEncoder( driftCompensator, aTrackRate, FrameDroppingMode::DISALLOW); } } - writer = MakeUnique(aTrackTypes); + writer = MakeUnique(); mimeType = NS_LITERAL_STRING(VIDEO_WEBM); } #endif // MOZ_WEBM_ENCODER @@ -737,109 +737,78 @@ already_AddRefed MediaEncoder::CreateEncoder( audioEncoder, videoEncoder, aTrackRate, mimeType); } -nsresult MediaEncoder::GetEncodedMetadata( - nsTArray>* aOutputBufs, nsAString& aMIMEType) { - AUTO_PROFILER_LABEL("MediaEncoder::GetEncodedMetadata", OTHER); - - MOZ_ASSERT(mEncoderThread->IsCurrentThreadIn()); - - if (mShutdown) { - MOZ_ASSERT(false); - return NS_ERROR_FAILURE; - } - - if (!mInitialized) { - MOZ_ASSERT(false); - return NS_ERROR_FAILURE; - } - - if (mMetadataEncoded) { - MOZ_ASSERT(false); - return NS_ERROR_FAILURE; - } - - aMIMEType = mMIMEType; - - LOG(LogLevel::Verbose, - ("GetEncodedMetadata TimeStamp = %f", GetEncodeTimeStamp())); - - nsresult rv; - - if (mAudioEncoder) { - if (!mAudioEncoder->IsInitialized()) { - LOG(LogLevel::Error, - ("GetEncodedMetadata Audio encoder not initialized")); - MOZ_ASSERT(false); - return NS_ERROR_FAILURE; - } - rv = CopyMetadataToMuxer(mAudioEncoder); - if (NS_FAILED(rv)) { - LOG(LogLevel::Error, ("Failed to Set Audio Metadata")); - SetError(); - return rv; - } - } - if (mVideoEncoder) { - if (!mVideoEncoder->IsInitialized()) { - LOG(LogLevel::Error, - ("GetEncodedMetadata Video encoder not initialized")); - MOZ_ASSERT(false); - return NS_ERROR_FAILURE; - } - rv = CopyMetadataToMuxer(mVideoEncoder.get()); - if (NS_FAILED(rv)) { - LOG(LogLevel::Error, ("Failed to Set Video Metadata")); - SetError(); - return rv; - } - } - - rv = mWriter->GetContainerData(aOutputBufs, ContainerWriter::GET_HEADER); - if (NS_FAILED(rv)) { - LOG(LogLevel::Error, 
("Writer fail to generate header!")); - SetError(); - return rv; - } - LOG(LogLevel::Verbose, - ("Finish GetEncodedMetadata TimeStamp = %f", GetEncodeTimeStamp())); - mMetadataEncoded = true; - - return NS_OK; -} - nsresult MediaEncoder::GetEncodedData( nsTArray>* aOutputBufs) { AUTO_PROFILER_LABEL("MediaEncoder::GetEncodedData", OTHER); MOZ_ASSERT(mEncoderThread->IsCurrentThreadIn()); - - if (!mMetadataEncoded) { - MOZ_ASSERT(false); - return NS_ERROR_FAILURE; - } + MOZ_ASSERT(mInitialized); + MOZ_ASSERT_IF(mAudioEncoder, mAudioEncoder->IsInitialized()); + MOZ_ASSERT_IF(mVideoEncoder, mVideoEncoder->IsInitialized()); nsresult rv; LOG(LogLevel::Verbose, ("GetEncodedData TimeStamp = %f", GetEncodeTimeStamp())); - rv = EncodeData(); - if (NS_FAILED(rv)) { - return rv; + if (mMuxer->NeedsMetadata()) { + nsTArray> meta; + if (mAudioEncoder && !*meta.AppendElement(mAudioEncoder->GetMetadata())) { + LOG(LogLevel::Error, ("Audio metadata is null")); + SetError(); + return NS_ERROR_ABORT; + } + if (mVideoEncoder && !*meta.AppendElement(mVideoEncoder->GetMetadata())) { + LOG(LogLevel::Error, ("Video metadata is null")); + SetError(); + return NS_ERROR_ABORT; + } + + rv = mMuxer->SetMetadata(meta); + if (NS_FAILED(rv)) { + LOG(LogLevel::Error, ("SetMetadata failed")); + SetError(); + return rv; + } } - rv = WriteEncodedDataToMuxer(); - if (NS_FAILED(rv)) { - return rv; + // First, feed encoded data from encoders to muxer. + + if (mVideoEncoder && !mVideoEncoder->IsEncodingComplete()) { + nsTArray> videoFrames; + rv = mVideoEncoder->GetEncodedTrack(videoFrames); + if (NS_FAILED(rv)) { + // Encoding might be canceled. + LOG(LogLevel::Error, ("Failed to get encoded data from video encoder.")); + return rv; + } + for (const RefPtr& frame : videoFrames) { + mMuxer->AddEncodedVideoFrame(frame); + } + if (mVideoEncoder->IsEncodingComplete()) { + mMuxer->VideoEndOfStream(); + } } - // In audio only or video only case, let unavailable track's flag to be - // true. - bool isAudioCompleted = !mAudioEncoder || mAudioEncoder->IsEncodingComplete(); - bool isVideoCompleted = !mVideoEncoder || mVideoEncoder->IsEncodingComplete(); - rv = mWriter->GetContainerData( - aOutputBufs, - isAudioCompleted && isVideoCompleted ? ContainerWriter::FLUSH_NEEDED : 0); - if (mWriter->IsWritingComplete()) { + if (mAudioEncoder && !mAudioEncoder->IsEncodingComplete()) { + nsTArray> audioFrames; + rv = mAudioEncoder->GetEncodedTrack(audioFrames); + if (NS_FAILED(rv)) { + // Encoding might be canceled. + LOG(LogLevel::Error, ("Failed to get encoded data from audio encoder.")); + return rv; + } + for (const RefPtr& frame : audioFrames) { + mMuxer->AddEncodedAudioFrame(frame); + } + if (mAudioEncoder->IsEncodingComplete()) { + mMuxer->AudioEndOfStream(); + } + } + + // Second, get data from muxer. This will do the actual muxing. 
+ + rv = mMuxer->GetData(aOutputBufs); + if (mMuxer->IsFinished()) { mCompleted = true; Shutdown(); } @@ -847,7 +816,9 @@ nsresult MediaEncoder::GetEncodedData( LOG(LogLevel::Verbose, ("END GetEncodedData TimeStamp=%f " "mCompleted=%d, aComplete=%d, vComplete=%d", - GetEncodeTimeStamp(), mCompleted, isAudioCompleted, isVideoCompleted)); + GetEncodeTimeStamp(), mCompleted, + !mAudioEncoder || mAudioEncoder->IsEncodingComplete(), + !mVideoEncoder || mVideoEncoder->IsEncodingComplete())); return rv; } @@ -891,189 +862,6 @@ void MediaEncoder::Shutdown() { } } -nsresult MediaEncoder::EncodeData() { - AUTO_PROFILER_LABEL("MediaEncoder::EncodeData", OTHER); - - MOZ_ASSERT(mEncoderThread->IsCurrentThreadIn()); - - if (!mVideoEncoder && !mAudioEncoder) { - MOZ_ASSERT_UNREACHABLE("Must have atleast one encoder"); - return NS_ERROR_UNEXPECTED; - } - - if (mVideoEncoder && !mVideoEncoder->IsEncodingComplete()) { - nsTArray> videoFrames; - nsresult rv = mVideoEncoder->GetEncodedTrack(videoFrames); - if (NS_FAILED(rv)) { - // Encoding might be canceled. - LOG(LogLevel::Error, ("Failed to get encoded data from video encoder.")); - return rv; - } - for (const RefPtr& frame : videoFrames) { - mEncodedVideoFrames.Push(frame); - } - } - - if (mAudioEncoder && !mAudioEncoder->IsEncodingComplete()) { - nsTArray> audioFrames; - nsresult rv = mAudioEncoder->GetEncodedTrack(audioFrames); - if (NS_FAILED(rv)) { - // Encoding might be canceled. - LOG(LogLevel::Error, ("Failed to get encoded data from audio encoder.")); - return rv; - } - for (const RefPtr& frame : audioFrames) { - if (frame->mFrameType == EncodedFrame::FrameType::OPUS_AUDIO_FRAME) { - frame->mTime += mAudioCodecDelay; - } - mEncodedAudioFrames.Push(frame); - } - } - - return NS_OK; -} - -nsresult MediaEncoder::WriteEncodedDataToMuxer() { - AUTO_PROFILER_LABEL("MediaEncoder::WriteEncodedDataToMuxer", OTHER); - - MOZ_ASSERT(mEncoderThread->IsCurrentThreadIn()); - - if (!mVideoEncoder && !mAudioEncoder) { - MOZ_ASSERT_UNREACHABLE("Must have atleast one encoder"); - return NS_ERROR_UNEXPECTED; - } - - // If we have a single encoder we don't have to worry about interleaving - if ((mVideoEncoder && !mAudioEncoder) || (mAudioEncoder && !mVideoEncoder)) { - TrackEncoder* encoder = mAudioEncoder - ? static_cast(mAudioEncoder) - : static_cast(mVideoEncoder); - MediaQueue* encodedFramesQueue = - mAudioEncoder ? &mEncodedAudioFrames : &mEncodedVideoFrames; - nsTArray> frames; - while (encodedFramesQueue->GetSize() > 0) { - frames.AppendElement(encodedFramesQueue->PopFront()); - } - nsresult rv = mWriter->WriteEncodedTrack( - frames, - encoder->IsEncodingComplete() ? ContainerWriter::END_OF_STREAM : 0); - if (NS_FAILED(rv)) { - LOG(LogLevel::Error, - ("Failed to write encoded video track to the muxer.")); - return rv; - } - - // Done with single encoder case. - return NS_OK; - } - - // If we reach here we have both video and audio encoders, so we interleave - // the frames. - nsTArray> frames; - RefPtr videoFrame; - RefPtr audioFrame; - // The times at which we expect our next video and audio frames. These are - // based on the time + duration (GetEndTime()) of the last seen frames. - // Assumes that the encoders write the correct duration for frames. 
- uint64_t expectedNextVideoTime = 0; - uint64_t expectedNextAudioTime = 0; - // Interleave frames until we're out of audio or video - while (mEncodedVideoFrames.GetSize() > 0 && - mEncodedAudioFrames.GetSize() > 0) { - videoFrame = mEncodedVideoFrames.PeekFront(); - audioFrame = mEncodedAudioFrames.PeekFront(); - // For any expected time our frames should occur at or after that time. - MOZ_ASSERT(videoFrame->mTime >= expectedNextVideoTime); - MOZ_ASSERT(audioFrame->mTime >= expectedNextAudioTime); - if (videoFrame->mTime <= audioFrame->mTime) { - expectedNextVideoTime = videoFrame->GetEndTime(); - RefPtr frame = mEncodedVideoFrames.PopFront(); - frames.AppendElement(frame); - } else { - expectedNextAudioTime = audioFrame->GetEndTime(); - RefPtr frame = mEncodedAudioFrames.PopFront(); - frames.AppendElement(frame); - } - } - - // If we're out of audio we still may be able to add more video... - if (mEncodedAudioFrames.GetSize() == 0) { - while (mEncodedVideoFrames.GetSize() > 0) { - videoFrame = mEncodedVideoFrames.PeekFront(); - // If audio encoding is not complete and if the video frame would come - // after our next audio frame we cannot safely add it. - if (!mAudioEncoder->IsEncodingComplete() && - videoFrame->mTime > expectedNextAudioTime) { - break; - } - frames.AppendElement(mEncodedVideoFrames.PopFront()); - } - } - - // If we're out of video we still may be able to add more audio... - if (mEncodedVideoFrames.GetSize() == 0) { - while (mEncodedAudioFrames.GetSize() > 0) { - audioFrame = mEncodedAudioFrames.PeekFront(); - // If video encoding is not complete and if the audio frame would come - // after our next video frame we cannot safely add it. - if (!mVideoEncoder->IsEncodingComplete() && - audioFrame->mTime > expectedNextVideoTime) { - break; - } - frames.AppendElement(mEncodedAudioFrames.PopFront()); - } - } - - // If encoding is complete for both encoders we should signal end of stream, - // otherwise we keep going. - uint32_t flags = - mVideoEncoder->IsEncodingComplete() && mAudioEncoder->IsEncodingComplete() - ? ContainerWriter::END_OF_STREAM - : 0; - nsresult rv = mWriter->WriteEncodedTrack(frames, flags); - if (NS_FAILED(rv)) { - LOG(LogLevel::Error, ("Error! Fail to write encoded video + audio track " - "to the media container.")); - } - return rv; -} - -nsresult MediaEncoder::CopyMetadataToMuxer(TrackEncoder* aTrackEncoder) { - AUTO_PROFILER_LABEL("MediaEncoder::CopyMetadataToMuxer", OTHER); - - MOZ_ASSERT(mEncoderThread->IsCurrentThreadIn()); - - if (!aTrackEncoder) { - NS_ERROR("No track encoder to get metadata from"); - return NS_ERROR_FAILURE; - } - - RefPtr meta = aTrackEncoder->GetMetadata(); - if (meta == nullptr) { - LOG(LogLevel::Error, ("metadata == null")); - SetError(); - return NS_ERROR_ABORT; - } - - // In the case of Opus we need to calculate the codec delay based on the - // pre-skip. 
For more information see: - // https://tools.ietf.org/html/rfc7845#section-4.2 - if (meta->GetKind() == TrackMetadataBase::MetadataKind::METADATA_OPUS) { - // Calculate offset in microseconds - OpusMetadata* opusMeta = static_cast(meta.get()); - mAudioCodecDelay = static_cast( - LittleEndian::readUint16(opusMeta->mIdHeader.Elements() + 10) * - PR_USEC_PER_SEC / 48000); - } - - nsresult rv = mWriter->SetMetadata(meta); - if (NS_FAILED(rv)) { - LOG(LogLevel::Error, ("SetMetadata failed")); - SetError(); - } - return rv; -} - bool MediaEncoder::IsShutdown() { MOZ_ASSERT(mEncoderThread->IsCurrentThreadIn()); return mShutdown; @@ -1152,6 +940,11 @@ bool MediaEncoder::IsWebMEncoderEnabled() { } #endif +const nsString& MediaEncoder::MimeType() const { + MOZ_ASSERT(mEncoderThread->IsCurrentThreadIn()); + return mMIMEType; +} + void MediaEncoder::NotifyInitialized() { MOZ_ASSERT(mEncoderThread->IsCurrentThreadIn()); diff --git a/dom/media/encoder/MediaEncoder.h b/dom/media/encoder/MediaEncoder.h index ef624c14747a..431d91ac2e35 100644 --- a/dom/media/encoder/MediaEncoder.h +++ b/dom/media/encoder/MediaEncoder.h @@ -20,6 +20,7 @@ namespace mozilla { class DriftCompensator; +class Muxer; class Runnable; class TaskQueue; @@ -77,29 +78,21 @@ class MediaEncoderListener { * been initialized and when there's data available. * => encoder->RegisterListener(listener); * - * 3) Connect the MediaStreamTracks to be recorded. - * => encoder->ConnectMediaStreamTrack(track); - * This creates the corresponding TrackEncoder and connects the track and - * the TrackEncoder through a track listener. This also starts encoding. - * - * 4) When the MediaEncoderListener is notified that the MediaEncoder is - * initialized, we can encode metadata. - * => encoder->GetEncodedMetadata(...); - * - * 5) When the MediaEncoderListener is notified that the MediaEncoder has - * data available, we can encode data. + * 3) When the MediaEncoderListener is notified that the MediaEncoder has + * data available, we can encode data. This also encodes metadata on its + * first invocation. * => encoder->GetEncodedData(...); * - * 6) To stop encoding, there are multiple options: + * 4) To stop encoding, there are multiple options: * - * 6.1) Stop() for a graceful stop. + * 4.1) Stop() for a graceful stop. * => encoder->Stop(); * - * 6.2) Cancel() for an immediate stop, if you don't need the data currently + * 4.2) Cancel() for an immediate stop, if you don't need the data currently * buffered. * => encoder->Cancel(); * - * 6.3) When all input tracks end, the MediaEncoder will automatically stop + * 4.3) When all input tracks end, the MediaEncoder will automatically stop * and shut down. */ class MediaEncoder { @@ -158,24 +151,12 @@ class MediaEncoder { uint32_t aAudioBitrate, uint32_t aVideoBitrate, uint8_t aTrackTypes, TrackRate aTrackRate); - /** - * Encodes raw metadata for all tracks to aOutputBufs. aMIMEType is the valid - * mime-type for the returned container data. The buffer of container data is - * allocated in ContainerWriter::GetContainerData(). - * - * Should there be insufficient input data for either track encoder to infer - * the metadata, or if metadata has already been encoded, we return an error - * and the output arguments are undefined. Otherwise we return NS_OK. - */ - nsresult GetEncodedMetadata(nsTArray>* aOutputBufs, - nsAString& aMIMEType); /** * Encodes raw data for all tracks to aOutputBufs. The buffer of container * data is allocated in ContainerWriter::GetContainerData(). 
* - * This implies that metadata has already been encoded and that all track - * encoders are still active. Should either implication break, we return an - * error and the output argument is undefined. Otherwise we return NS_OK. + * On its first call, metadata is also encoded. TrackEncoders must have been + * initialized before this is called. */ nsresult GetEncodedData(nsTArray>* aOutputBufs); @@ -197,6 +178,8 @@ class MediaEncoder { static bool IsWebMEncoderEnabled(); #endif + const nsString& MimeType() const; + /** * Notifies listeners that this MediaEncoder has been initialized. */ @@ -254,17 +237,10 @@ class MediaEncoder { */ void SetError(); - // Get metadata from trackEncoder and copy to muxer - nsresult CopyMetadataToMuxer(TrackEncoder* aTrackEncoder); - // Process data pending in encoder(s) - nsresult EncodeData(); - // Write pending encoded data to muxer - nsresult WriteEncodedDataToMuxer(); - const RefPtr mEncoderThread; const RefPtr mDriftCompensator; - UniquePtr mWriter; + UniquePtr mMuxer; RefPtr mAudioEncoder; RefPtr mAudioListener; RefPtr mVideoEncoder; @@ -288,19 +264,9 @@ class MediaEncoder { // doesn't contain video on start() or if the input is an AudioNode. RefPtr mVideoTrack; - // Audio frames that have been encoded and are pending write to the muxer - MediaQueue mEncodedAudioFrames; - // Video frames that have been encoded and are pending write to the muxer - MediaQueue mEncodedVideoFrames; - - // How much each audio time stamp should be delayed in microseconds. Used to - // adjust for opus codec delay. - uint64_t mAudioCodecDelay = 0; - TimeStamp mStartTime; - nsString mMIMEType; + const nsString mMIMEType; bool mInitialized; - bool mMetadataEncoded; bool mCompleted; bool mError; bool mCanceled; diff --git a/dom/media/encoder/Muxer.cpp b/dom/media/encoder/Muxer.cpp new file mode 100644 index 000000000000..f20d1a7e3238 --- /dev/null +++ b/dom/media/encoder/Muxer.cpp @@ -0,0 +1,228 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-*/ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "Muxer.h" + +#include "ContainerWriter.h" + +namespace mozilla { + +LazyLogModule gMuxerLog("Muxer"); +#define LOG(type, ...) MOZ_LOG(gMuxerLog, type, (__VA_ARGS__)) + +Muxer::Muxer(UniquePtr aWriter) + : mWriter(std::move(aWriter)) {} + +bool Muxer::IsFinished() { return mWriter->IsWritingComplete(); } + +nsresult Muxer::SetMetadata( + const nsTArray>& aMetadata) { + nsresult rv = mWriter->SetMetadata(aMetadata); + if (NS_FAILED(rv)) { + LOG(LogLevel::Error, "%p Setting metadata failed, tracks=%zu", this, + aMetadata.Length()); + return rv; + } + + for (const auto& track : aMetadata) { + switch (track->GetKind()) { + case TrackMetadataBase::METADATA_OPUS: { + // In the case of Opus we need to calculate the codec delay based on the + // pre-skip. 
For more information see: + // https://tools.ietf.org/html/rfc7845#section-4.2 + // Calculate offset in microseconds + OpusMetadata* opusMeta = static_cast(track.get()); + mAudioCodecDelay = static_cast( + LittleEndian::readUint16(opusMeta->mIdHeader.Elements() + 10) * + PR_USEC_PER_SEC / 48000); + MOZ_FALLTHROUGH; + } + case TrackMetadataBase::METADATA_VORBIS: + case TrackMetadataBase::METADATA_AAC: + case TrackMetadataBase::METADATA_AMR: + case TrackMetadataBase::METADATA_EVRC: + MOZ_ASSERT(!mHasAudio, "Only one audio track supported"); + mHasAudio = true; + break; + case TrackMetadataBase::METADATA_VP8: + MOZ_ASSERT(!mHasVideo, "Only one video track supported"); + mHasVideo = true; + break; + default: + MOZ_CRASH("Unknown codec metadata"); + }; + } + mMetadataSet = true; + MOZ_ASSERT(mHasAudio || mHasVideo); + if (!mHasAudio) { + mEncodedAudioFrames.Finish(); + MOZ_ASSERT(mEncodedAudioFrames.AtEndOfStream()); + } + if (!mHasVideo) { + mEncodedVideoFrames.Finish(); + MOZ_ASSERT(mEncodedVideoFrames.AtEndOfStream()); + } + LOG(LogLevel::Info, "%p Metadata set; audio=%d, video=%d", this, mHasAudio, + mHasVideo); + return rv; +} + +void Muxer::AddEncodedAudioFrame(EncodedFrame* aFrame) { + MOZ_ASSERT(mMetadataSet); + MOZ_ASSERT(mHasAudio); + if (aFrame->mFrameType == EncodedFrame::FrameType::OPUS_AUDIO_FRAME) { + aFrame->mTime += mAudioCodecDelay; + } + mEncodedAudioFrames.Push(aFrame); + LOG(LogLevel::Verbose, + "%p Added audio frame of type %u, [start %" PRIu64 ", end %" PRIu64 ")", + this, aFrame->mFrameType, aFrame->mTime, + aFrame->mTime + aFrame->mDuration); +} + +void Muxer::AddEncodedVideoFrame(EncodedFrame* aFrame) { + MOZ_ASSERT(mMetadataSet); + MOZ_ASSERT(mHasVideo); + mEncodedVideoFrames.Push(aFrame); + LOG(LogLevel::Verbose, + "%p Added video frame of type %u, [start %" PRIu64 ", end %" PRIu64 ")", + this, aFrame->mFrameType, aFrame->mTime, + aFrame->mTime + aFrame->mDuration); +} + +void Muxer::AudioEndOfStream() { + MOZ_ASSERT(mMetadataSet); + MOZ_ASSERT(mHasAudio); + LOG(LogLevel::Info, "%p Reached audio EOS", this); + mEncodedAudioFrames.Finish(); +} + +void Muxer::VideoEndOfStream() { + MOZ_ASSERT(mMetadataSet); + MOZ_ASSERT(mHasVideo); + LOG(LogLevel::Info, "%p Reached video EOS", this); + mEncodedVideoFrames.Finish(); +} + +nsresult Muxer::GetData(nsTArray>* aOutputBuffers) { + MOZ_ASSERT(mMetadataSet); + MOZ_ASSERT(mHasAudio || mHasVideo); + + nsresult rv; + if (!mMetadataEncoded) { + rv = mWriter->GetContainerData(aOutputBuffers, ContainerWriter::GET_HEADER); + if (NS_FAILED(rv)) { + LOG(LogLevel::Error, "%p Failed getting metadata from writer", this); + return rv; + } + mMetadataEncoded = true; + } + + if (mEncodedAudioFrames.GetSize() == 0 && !mEncodedAudioFrames.IsFinished() && + mEncodedVideoFrames.GetSize() == 0 && !mEncodedVideoFrames.IsFinished()) { + // Nothing to mux. + return NS_OK; + } + + rv = Mux(); + if (NS_FAILED(rv)) { + LOG(LogLevel::Error, "%p Failed muxing data into writer", this); + return rv; + } + + MOZ_ASSERT_IF( + mEncodedAudioFrames.IsFinished() && mEncodedVideoFrames.IsFinished(), + mEncodedAudioFrames.AtEndOfStream()); + MOZ_ASSERT_IF( + mEncodedAudioFrames.IsFinished() && mEncodedVideoFrames.IsFinished(), + mEncodedVideoFrames.AtEndOfStream()); + uint32_t flags = + mEncodedAudioFrames.AtEndOfStream() && mEncodedVideoFrames.AtEndOfStream() + ? 
ContainerWriter::FLUSH_NEEDED + : 0; + + if (mEncodedAudioFrames.AtEndOfStream() && + mEncodedVideoFrames.AtEndOfStream()) { + LOG(LogLevel::Info, "%p All data written", this); + } + + return mWriter->GetContainerData(aOutputBuffers, flags); +} + +nsresult Muxer::Mux() { + MOZ_ASSERT(mMetadataSet); + MOZ_ASSERT(mHasAudio || mHasVideo); + + nsTArray> frames; + // The times at which we expect our next video and audio frames. These are + // based on the time + duration (GetEndTime()) of the last seen frames. + // Assumes that the encoders write the correct duration for frames.; + uint64_t expectedNextVideoTime = 0; + uint64_t expectedNextAudioTime = 0; + // Interleave frames until we're out of audio or video + while (mEncodedVideoFrames.GetSize() > 0 && + mEncodedAudioFrames.GetSize() > 0) { + RefPtr videoFrame = mEncodedVideoFrames.PeekFront(); + RefPtr audioFrame = mEncodedAudioFrames.PeekFront(); + // For any expected time our frames should occur at or after that time. + MOZ_ASSERT(videoFrame->mTime >= expectedNextVideoTime); + MOZ_ASSERT(audioFrame->mTime >= expectedNextAudioTime); + if (videoFrame->mTime <= audioFrame->mTime) { + expectedNextVideoTime = videoFrame->GetEndTime(); + RefPtr frame = mEncodedVideoFrames.PopFront(); + frames.AppendElement(frame); + } else { + expectedNextAudioTime = audioFrame->GetEndTime(); + RefPtr frame = mEncodedAudioFrames.PopFront(); + frames.AppendElement(frame); + } + } + + // If we're out of audio we still may be able to add more video... + if (mEncodedAudioFrames.GetSize() == 0) { + while (mEncodedVideoFrames.GetSize() > 0) { + if (!mEncodedAudioFrames.AtEndOfStream() && + mEncodedVideoFrames.PeekFront()->mTime > expectedNextAudioTime) { + // Audio encoding is not complete and since the video frame comes + // after our next audio frame we cannot safely add it. + break; + } + frames.AppendElement(mEncodedVideoFrames.PopFront()); + } + } + + // If we're out of video we still may be able to add more audio... + if (mEncodedVideoFrames.GetSize() == 0) { + while (mEncodedAudioFrames.GetSize() > 0) { + if (!mEncodedVideoFrames.AtEndOfStream() && + mEncodedAudioFrames.PeekFront()->mTime > expectedNextVideoTime) { + // Video encoding is not complete and since the audio frame comes + // after our next video frame we cannot safely add it. + break; + } + frames.AppendElement(mEncodedAudioFrames.PopFront()); + } + } + + LOG(LogLevel::Debug, + "%p Muxed data, remaining-audio=%zu, remaining-video=%zu", this, + mEncodedAudioFrames.GetSize(), mEncodedVideoFrames.GetSize()); + + // If encoding is complete for both encoders we should signal end of stream, + // otherwise we keep going. + uint32_t flags = + mEncodedVideoFrames.AtEndOfStream() && mEncodedAudioFrames.AtEndOfStream() + ? ContainerWriter::END_OF_STREAM + : 0; + nsresult rv = mWriter->WriteEncodedTrack(frames, flags); + if (NS_FAILED(rv)) { + LOG(LogLevel::Error, "Error! Failed to write muxed data to the container"); + } + return rv; +} + +} // namespace mozilla + +#undef LOG diff --git a/dom/media/encoder/Muxer.h b/dom/media/encoder/Muxer.h new file mode 100644 index 000000000000..f2f93582d3cf --- /dev/null +++ b/dom/media/encoder/Muxer.h @@ -0,0 +1,74 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-*/ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef DOM_MEDIA_ENCODER_MUXER_H_ +#define DOM_MEDIA_ENCODER_MUXER_H_ + +#include "MediaQueue.h" + +namespace mozilla { + +class ContainerWriter; + +// Generic Muxer class that helps pace the output from track encoders to the +// ContainerWriter, so time never appears to go backwards. +// Note that the entire class is written for single threaded access. +class Muxer { + public: + explicit Muxer(UniquePtr aWriter); + ~Muxer() = default; + + // Returns true when all tracks have ended, and all data has been muxed and + // fetched. + bool IsFinished(); + + // Returns true if this muxer has not been given metadata yet. + bool NeedsMetadata() const { return !mMetadataSet; } + + // Sets metadata for all tracks. This may only be called once. + nsresult SetMetadata(const nsTArray>& aMetadata); + + // Adds an encoded audio frame for muxing + void AddEncodedAudioFrame(EncodedFrame* aFrame); + + // Adds an encoded video frame for muxing + void AddEncodedVideoFrame(EncodedFrame* aFrame); + + // Marks the audio track as ended. Once all tracks for which we have metadata + // have ended, GetData() will drain and the muxer will be marked as finished. + void AudioEndOfStream(); + + // Marks the video track as ended. Once all tracks for which we have metadata + // have ended, GetData() will drain and the muxer will be marked as finished. + void VideoEndOfStream(); + + // Gets the data that has been muxed and written into the container so far. + nsresult GetData(nsTArray>* aOutputBuffers); + + private: + // Writes data in MediaQueues to the ContainerWriter. + nsresult Mux(); + + // Audio frames that have been encoded and are pending write to the muxer. + MediaQueue mEncodedAudioFrames; + // Video frames that have been encoded and are pending write to the muxer. + MediaQueue mEncodedVideoFrames; + // The writer for the specific container we're recording into. + UniquePtr mWriter; + // How much each audio time stamp should be delayed in microseconds. Used to + // adjust for opus codec delay. + uint64_t mAudioCodecDelay = 0; + // True once metadata has been set in the muxer. + bool mMetadataSet = false; + // True once metadata has been written to file. + bool mMetadataEncoded = false; + // True if metadata is set and contains an audio track. + bool mHasAudio = false; + // True if metadata is set and contains a video track. 
+ bool mHasVideo = false; +}; +} // namespace mozilla + +#endif diff --git a/dom/media/encoder/moz.build b/dom/media/encoder/moz.build index e08dd8cd9963..dd1730553fc5 100644 --- a/dom/media/encoder/moz.build +++ b/dom/media/encoder/moz.build @@ -18,6 +18,7 @@ EXPORTS += [ UNIFIED_SOURCES += [ 'MediaEncoder.cpp', + 'Muxer.cpp', 'OpusTrackEncoder.cpp', 'TrackEncoder.cpp', ] diff --git a/dom/media/gtest/TestWebMWriter.cpp b/dom/media/gtest/TestWebMWriter.cpp index 3576d095a363..800c8128e86a 100644 --- a/dom/media/gtest/TestWebMWriter.cpp +++ b/dom/media/gtest/TestWebMWriter.cpp @@ -40,28 +40,30 @@ class WebMVP8TrackEncoder : public VP8TrackEncoder { } }; +static void GetOpusMetadata(int aChannels, int aSampleRate, + TrackRate aTrackRate, + nsTArray>& aMeta) { + WebMOpusTrackEncoder opusEncoder(aTrackRate); + EXPECT_TRUE(opusEncoder.TestOpusCreation(aChannels, aSampleRate)); + aMeta.AppendElement(opusEncoder.GetMetadata()); +} + +static void GetVP8Metadata(int32_t aWidth, int32_t aHeight, + int32_t aDisplayWidth, int32_t aDisplayHeight, + TrackRate aTrackRate, + nsTArray>& aMeta) { + WebMVP8TrackEncoder vp8Encoder; + EXPECT_TRUE(vp8Encoder.TestVP8Creation(aWidth, aHeight, aDisplayWidth, + aDisplayHeight)); + aMeta.AppendElement(vp8Encoder.GetMetadata()); +} + const uint64_t FIXED_DURATION = 1000000; const uint32_t FIXED_FRAMESIZE = 500; class TestWebMWriter : public WebMWriter { public: - explicit TestWebMWriter(int aTrackTypes) - : WebMWriter(aTrackTypes), mTimestamp(0) {} - - void SetOpusMetadata(int aChannels, int aSampleRate, TrackRate aTrackRate) { - WebMOpusTrackEncoder opusEncoder(aTrackRate); - EXPECT_TRUE(opusEncoder.TestOpusCreation(aChannels, aSampleRate)); - RefPtr opusMeta = opusEncoder.GetMetadata(); - SetMetadata(opusMeta); - } - void SetVP8Metadata(int32_t aWidth, int32_t aHeight, int32_t aDisplayWidth, - int32_t aDisplayHeight, TrackRate aTrackRate) { - WebMVP8TrackEncoder vp8Encoder; - EXPECT_TRUE(vp8Encoder.TestVP8Creation(aWidth, aHeight, aDisplayWidth, - aDisplayHeight)); - RefPtr vp8Meta = vp8Encoder.GetMetadata(); - SetMetadata(vp8Meta); - } + TestWebMWriter() : WebMWriter(), mTimestamp(0) {} // When we append an I-Frame into WebM muxer, the muxer will treat previous // data as "a cluster". @@ -96,8 +98,7 @@ class TestWebMWriter : public WebMWriter { TEST(WebMWriter, Metadata) { - TestWebMWriter writer(ContainerWriter::CREATE_AUDIO_TRACK | - ContainerWriter::CREATE_VIDEO_TRACK); + TestWebMWriter writer; // The output should be empty since we didn't set any metadata in writer. nsTArray> encodedBuf; @@ -106,25 +107,23 @@ TEST(WebMWriter, Metadata) writer.GetContainerData(&encodedBuf, ContainerWriter::FLUSH_NEEDED); EXPECT_TRUE(encodedBuf.Length() == 0); - // Set opus metadata. + nsTArray> meta; + + // Get opus metadata. int channel = 1; int sampleRate = 44100; TrackRate aTrackRate = 90000; - writer.SetOpusMetadata(channel, sampleRate, aTrackRate); + GetOpusMetadata(channel, sampleRate, aTrackRate, meta); - // No output data since we didn't set both audio/video - // metadata in writer. 
- writer.GetContainerData(&encodedBuf, ContainerWriter::GET_HEADER); - EXPECT_TRUE(encodedBuf.Length() == 0); - writer.GetContainerData(&encodedBuf, ContainerWriter::FLUSH_NEEDED); - EXPECT_TRUE(encodedBuf.Length() == 0); - - // Set vp8 metadata + // Get vp8 metadata int32_t width = 640; int32_t height = 480; int32_t displayWidth = 640; int32_t displayHeight = 480; - writer.SetVP8Metadata(width, height, displayWidth, displayHeight, aTrackRate); + GetVP8Metadata(width, height, displayWidth, displayHeight, aTrackRate, meta); + + // Set metadata + writer.SetMetadata(meta); writer.GetContainerData(&encodedBuf, ContainerWriter::GET_HEADER); EXPECT_TRUE(encodedBuf.Length() > 0); @@ -132,19 +131,20 @@ TEST(WebMWriter, Metadata) TEST(WebMWriter, Cluster) { - TestWebMWriter writer(ContainerWriter::CREATE_AUDIO_TRACK | - ContainerWriter::CREATE_VIDEO_TRACK); - // Set opus metadata. + TestWebMWriter writer; + nsTArray> meta; + // Get opus metadata. int channel = 1; int sampleRate = 48000; TrackRate aTrackRate = 90000; - writer.SetOpusMetadata(channel, sampleRate, aTrackRate); - // Set vp8 metadata + GetOpusMetadata(channel, sampleRate, aTrackRate, meta); + // Get vp8 metadata int32_t width = 320; int32_t height = 240; int32_t displayWidth = 320; int32_t displayHeight = 240; - writer.SetVP8Metadata(width, height, displayWidth, displayHeight, aTrackRate); + GetVP8Metadata(width, height, displayWidth, displayHeight, aTrackRate, meta); + writer.SetMetadata(meta); nsTArray> encodedBuf; writer.GetContainerData(&encodedBuf, ContainerWriter::GET_HEADER); @@ -174,19 +174,20 @@ TEST(WebMWriter, Cluster) TEST(WebMWriter, FLUSH_NEEDED) { - TestWebMWriter writer(ContainerWriter::CREATE_AUDIO_TRACK | - ContainerWriter::CREATE_VIDEO_TRACK); - // Set opus metadata. + TestWebMWriter writer; + nsTArray> meta; + // Get opus metadata. int channel = 2; int sampleRate = 44100; TrackRate aTrackRate = 100000; - writer.SetOpusMetadata(channel, sampleRate, aTrackRate); - // Set vp8 metadata + GetOpusMetadata(channel, sampleRate, aTrackRate, meta); + // Get vp8 metadata int32_t width = 176; int32_t height = 352; int32_t displayWidth = 176; int32_t displayHeight = 352; - writer.SetVP8Metadata(width, height, displayWidth, displayHeight, aTrackRate); + GetVP8Metadata(width, height, displayWidth, displayHeight, aTrackRate, meta); + writer.SetMetadata(meta); // write the first I-Frame. writer.AppendDummyFrame(EncodedFrame::VP8_I_FRAME, FIXED_DURATION); @@ -294,19 +295,20 @@ static int64_t webm_tell(void* aUserData) { TEST(WebMWriter, bug970774_aspect_ratio) { - TestWebMWriter writer(ContainerWriter::CREATE_AUDIO_TRACK | - ContainerWriter::CREATE_VIDEO_TRACK); - // Set opus metadata. + TestWebMWriter writer; + nsTArray> meta; + // Get opus metadata. int channel = 1; int sampleRate = 44100; TrackRate aTrackRate = 90000; - writer.SetOpusMetadata(channel, sampleRate, aTrackRate); + GetOpusMetadata(channel, sampleRate, aTrackRate, meta); // Set vp8 metadata int32_t width = 640; int32_t height = 480; int32_t displayWidth = 1280; int32_t displayHeight = 960; - writer.SetVP8Metadata(width, height, displayWidth, displayHeight, aTrackRate); + GetVP8Metadata(width, height, displayWidth, displayHeight, aTrackRate, meta); + writer.SetMetadata(meta); // write the first I-Frame. 
writer.AppendDummyFrame(EncodedFrame::VP8_I_FRAME, FIXED_DURATION); diff --git a/dom/media/ogg/OggWriter.cpp b/dom/media/ogg/OggWriter.cpp index 359fb510c796..bbe1414d5f86 100644 --- a/dom/media/ogg/OggWriter.cpp +++ b/dom/media/ogg/OggWriter.cpp @@ -142,12 +142,13 @@ nsresult OggWriter::GetContainerData(nsTArray>* aOutputBufs, rc = ogg_stream_flush(&mOggStreamState, &mOggPage); NS_ENSURE_TRUE(rc > 0, NS_ERROR_FAILURE); - ProduceOggPage(aOutputBufs); - return NS_OK; - // Force generate a page even if the amount of packet data is not enough. // Usually do so after a header packet. - } else if (aFlags & ContainerWriter::FLUSH_NEEDED) { + + ProduceOggPage(aOutputBufs); + } + + if (aFlags & ContainerWriter::FLUSH_NEEDED) { // rc = 0 means no packet to put into a page, or an internal error. rc = ogg_stream_flush(&mOggStreamState, &mOggPage); } else { @@ -162,20 +163,25 @@ nsresult OggWriter::GetContainerData(nsTArray>* aOutputBufs, if (aFlags & ContainerWriter::FLUSH_NEEDED) { mIsWritingComplete = true; } - return (rc > 0) ? NS_OK : NS_ERROR_FAILURE; + // We always return NS_OK here since it's OK to call this without having + // enough data to fill a page. It's the more common case compared to internal + // errors, and we cannot distinguish the two. + return NS_OK; } -nsresult OggWriter::SetMetadata(TrackMetadataBase* aMetadata) { - MOZ_ASSERT(aMetadata); +nsresult OggWriter::SetMetadata( + const nsTArray>& aMetadata) { + MOZ_ASSERT(aMetadata.Length() == 1); + MOZ_ASSERT(aMetadata[0]); AUTO_PROFILER_LABEL("OggWriter::SetMetadata", OTHER); - if (aMetadata->GetKind() != TrackMetadataBase::METADATA_OPUS) { + if (aMetadata[0]->GetKind() != TrackMetadataBase::METADATA_OPUS) { LOG("wrong meta data type!"); return NS_ERROR_FAILURE; } // Validate each field of METADATA - mMetadata = static_cast(aMetadata); + mMetadata = static_cast(aMetadata[0].get()); if (mMetadata->mIdHeader.Length() == 0) { LOG("miss mIdHeader!"); return NS_ERROR_FAILURE; diff --git a/dom/media/ogg/OggWriter.h b/dom/media/ogg/OggWriter.h index 0ce8753683da..73d5bd87e996 100644 --- a/dom/media/ogg/OggWriter.h +++ b/dom/media/ogg/OggWriter.h @@ -32,7 +32,8 @@ class OggWriter : public ContainerWriter { uint32_t aFlags = 0) override; // Check metadata type integrity and reject unacceptable track encoder. 
- nsresult SetMetadata(TrackMetadataBase* aMetadata) override; + nsresult SetMetadata( + const nsTArray>& aMetadata) override; private: nsresult Init(); diff --git a/dom/media/webm/WebMWriter.cpp b/dom/media/webm/WebMWriter.cpp index d3847e52a9bf..eee7f6030f35 100644 --- a/dom/media/webm/WebMWriter.cpp +++ b/dom/media/webm/WebMWriter.cpp @@ -10,8 +10,7 @@ namespace mozilla { -WebMWriter::WebMWriter(uint32_t aTrackTypes) : ContainerWriter() { - mMetadataRequiredFlag = aTrackTypes; +WebMWriter::WebMWriter() : ContainerWriter() { mEbmlComposer = new EbmlComposer(); } @@ -38,40 +37,75 @@ nsresult WebMWriter::GetContainerData(nsTArray>* aOutputBufs, return NS_OK; } -nsresult WebMWriter::SetMetadata(TrackMetadataBase* aMetadata) { - MOZ_ASSERT(aMetadata); +nsresult WebMWriter::SetMetadata( + const nsTArray>& aMetadata) { AUTO_PROFILER_LABEL("WebMWriter::SetMetadata", OTHER); + MOZ_DIAGNOSTIC_ASSERT(!aMetadata.IsEmpty()); - if (aMetadata->GetKind() == TrackMetadataBase::METADATA_VP8) { - VP8Metadata* meta = static_cast(aMetadata); - MOZ_ASSERT(meta, "Cannot find vp8 encoder metadata"); - mEbmlComposer->SetVideoConfig(meta->mWidth, meta->mHeight, - meta->mDisplayWidth, meta->mDisplayHeight); - mMetadataRequiredFlag = - mMetadataRequiredFlag & ~ContainerWriter::CREATE_VIDEO_TRACK; + // Integrity checks + bool bad = false; + for (const RefPtr& metadata : aMetadata) { + MOZ_ASSERT(metadata); + + if (metadata->GetKind() == TrackMetadataBase::METADATA_VP8) { + VP8Metadata* meta = static_cast(metadata.get()); + if (meta->mWidth == 0 || meta->mHeight == 0 || meta->mDisplayWidth == 0 || + meta->mDisplayHeight == 0) { + bad = true; + } + } + + if (metadata->GetKind() == TrackMetadataBase::METADATA_VORBIS) { + VorbisMetadata* meta = static_cast(metadata.get()); + if (meta->mSamplingFrequency == 0 || meta->mChannels == 0 || + meta->mData.IsEmpty()) { + bad = true; + } + } + + if (metadata->GetKind() == TrackMetadataBase::METADATA_OPUS) { + OpusMetadata* meta = static_cast(metadata.get()); + if (meta->mSamplingFrequency == 0 || meta->mChannels == 0 || + meta->mIdHeader.IsEmpty()) { + bad = true; + } + } + } + if (bad) { + return NS_ERROR_FAILURE; } - if (aMetadata->GetKind() == TrackMetadataBase::METADATA_VORBIS) { - VorbisMetadata* meta = static_cast(aMetadata); - MOZ_ASSERT(meta, "Cannot find vorbis encoder metadata"); - mEbmlComposer->SetAudioConfig(meta->mSamplingFrequency, meta->mChannels); - mEbmlComposer->SetAudioCodecPrivateData(meta->mData); - mMetadataRequiredFlag = - mMetadataRequiredFlag & ~ContainerWriter::CREATE_AUDIO_TRACK; - } + // Storing + bool hasAudio = false; + bool hasVideo = false; + for (const RefPtr& metadata : aMetadata) { + MOZ_ASSERT(metadata); - if (aMetadata->GetKind() == TrackMetadataBase::METADATA_OPUS) { - OpusMetadata* meta = static_cast(aMetadata); - MOZ_ASSERT(meta, "Cannot find Opus encoder metadata"); - mEbmlComposer->SetAudioConfig(meta->mSamplingFrequency, meta->mChannels); - mEbmlComposer->SetAudioCodecPrivateData(meta->mIdHeader); - mMetadataRequiredFlag = - mMetadataRequiredFlag & ~ContainerWriter::CREATE_AUDIO_TRACK; - } + if (metadata->GetKind() == TrackMetadataBase::METADATA_VP8) { + MOZ_DIAGNOSTIC_ASSERT(!hasVideo); + VP8Metadata* meta = static_cast(metadata.get()); + mEbmlComposer->SetVideoConfig(meta->mWidth, meta->mHeight, + meta->mDisplayWidth, meta->mDisplayHeight); + hasVideo = true; + } - if (!mMetadataRequiredFlag) { - mEbmlComposer->GenerateHeader(); + if (metadata->GetKind() == TrackMetadataBase::METADATA_VORBIS) { + MOZ_DIAGNOSTIC_ASSERT(!hasAudio); + 
VorbisMetadata* meta = static_cast(metadata.get()); + mEbmlComposer->SetAudioConfig(meta->mSamplingFrequency, meta->mChannels); + mEbmlComposer->SetAudioCodecPrivateData(meta->mData); + hasAudio = true; + } + + if (metadata->GetKind() == TrackMetadataBase::METADATA_OPUS) { + MOZ_DIAGNOSTIC_ASSERT(!hasAudio); + OpusMetadata* meta = static_cast(metadata.get()); + mEbmlComposer->SetAudioConfig(meta->mSamplingFrequency, meta->mChannels); + mEbmlComposer->SetAudioCodecPrivateData(meta->mIdHeader); + hasAudio = true; + } } + mEbmlComposer->GenerateHeader(); return NS_OK; } diff --git a/dom/media/webm/WebMWriter.h b/dom/media/webm/WebMWriter.h index 92ab92d70e0c..0af81d3d25b3 100644 --- a/dom/media/webm/WebMWriter.h +++ b/dom/media/webm/WebMWriter.h @@ -41,9 +41,8 @@ class VP8Metadata : public TrackMetadataBase { */ class WebMWriter : public ContainerWriter { public: - // aTrackTypes indicate this muxer should multiplex into Video only or A/V - // foramt. Run in MediaRecorder thread - explicit WebMWriter(uint32_t aTrackTypes); + // Run in MediaRecorder thread + WebMWriter(); virtual ~WebMWriter(); // WriteEncodedTrack inserts raw packets into WebM stream. Does not accept @@ -59,14 +58,11 @@ class WebMWriter : public ContainerWriter { uint32_t aFlags = 0) override; // Assign metadata into muxer - nsresult SetMetadata(TrackMetadataBase* aMetadata) override; + nsresult SetMetadata( + const nsTArray>& aMetadata) override; private: nsAutoPtr mEbmlComposer; - - // Indicate what kind of meta data needed in the writer. - // If this value become 0, it means writer can start to generate header. - uint8_t mMetadataRequiredFlag; }; } // namespace mozilla
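
Note (not part of the patch): for reviewers unfamiliar with the new class, the sketch below shows the call pattern the patch establishes, mirroring MediaEncoder::GetEncodedData() above. The helper name MuxOnce and its parameters are illustrative only; metadata is assumed to have been passed to Muxer::SetMetadata() once, before any media data, as required by the new API.

// Illustrative sketch only, assuming a Muxer that already has metadata set.
nsresult MuxOnce(Muxer& aMuxer,
                 const nsTArray<RefPtr<EncodedFrame>>& aAudioFrames,
                 const nsTArray<RefPtr<EncodedFrame>>& aVideoFrames,
                 bool aAudioEnded, bool aVideoEnded,
                 nsTArray<nsTArray<uint8_t>>* aOutput) {
  // Metadata must be provided exactly once, before any media data.
  MOZ_ASSERT(!aMuxer.NeedsMetadata());

  // Queue whatever encoded frames are available for each track.
  for (const RefPtr<EncodedFrame>& frame : aVideoFrames) {
    aMuxer.AddEncodedVideoFrame(frame);
  }
  if (aVideoEnded) {
    aMuxer.VideoEndOfStream();
  }
  for (const RefPtr<EncodedFrame>& frame : aAudioFrames) {
    aMuxer.AddEncodedAudioFrame(frame);
  }
  if (aAudioEnded) {
    aMuxer.AudioEndOfStream();
  }

  // GetData() interleaves the queued frames so timestamps never go backwards,
  // writes them to the ContainerWriter, and returns the muxed bytes. On its
  // first call it also writes the container header.
  return aMuxer.GetData(aOutput);
}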