Bug 1248897 - Introducing an online speech recognition service for Web Speech API r=smaug,pehrsons,padenot

This patch introduces a Speech Recognition Service that interfaces with Mozilla's remote STT endpoint, which is currently being used by multiple services. Differential Revision: https://phabricator.services.mozilla.com/D26047 --HG-- extra : moz-landing-system : lando
2024-11-30 00:01:50 +00:00 · 2019-10-21 20:58:57 +00:00 · 2019-10-21 20:58:57 +00:00 · 20834f4fb9
commit 20834f4fb9
parent 1fd2626e6a
32 changed files with 1412 additions and 142 deletions
--- a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp
+++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.cpp
@ -0,0 +1,473 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsThreadUtils.h"
+#include "nsXPCOMCIDInternal.h"
+#include "OnlineSpeechRecognitionService.h"
+#include "nsIFile.h"
+#include "SpeechGrammar.h"
+#include "SpeechRecognition.h"
+#include "SpeechRecognitionAlternative.h"
+#include "SpeechRecognitionResult.h"
+#include "SpeechRecognitionResultList.h"
+#include "nsIObserverService.h"
+#include "mozilla/StaticPrefs_media.h"
+#include "mozilla/Services.h"
+#include "nsDirectoryServiceDefs.h"
+#include "nsDirectoryServiceUtils.h"
+#include "nsMemory.h"
+#include "nsNetUtil.h"
+#include "nsContentUtils.h"
+#include "nsIPrincipal.h"
+#include "nsIStreamListener.h"
+#include "nsIUploadChannel2.h"
+#include "mozilla/dom/ClientIPCTypes.h"
+#include "nsStringStream.h"
+#include "nsIOutputStream.h"
+#include "nsStreamUtils.h"
+#include "OpusTrackEncoder.h"
+#include "OggWriter.h"
+#include "nsIClassOfService.h"
+#include <json/json.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace mozilla {
+
+using namespace dom;
+using namespace std;
+
+#define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \
+  "media.webspeech.service.endpoint"
+#define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/"
+#define MAX_LISTENING_TIME_MS 10000
+
+NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService,
+                  nsIStreamListener)
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) {
+  MOZ_ASSERT(NS_IsMainThread());
+  return NS_OK;  // No per-request setup is performed here.
+}
+
+// Segment writer handed to nsIInputStream::ReadSegments. aClosure is the
+// nsCString that collects the response; the aCount bytes starting at
+// aFromRawSegment are appended to it and reported back as fully consumed.
+static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure,
+                                       const char* aFromRawSegment,
+                                       uint32_t aToOffset, uint32_t aCount,
+                                       uint32_t* aWriteCount) {
+  auto* response = static_cast<nsCString*>(aClosure);
+  response->Append(aFromRawSegment, aCount);
+  *aWriteCount = aCount;
+  return NS_OK;
+}
+
+// nsIStreamListener: appends the aCount bytes that just became available on
+// aInputStream to mBuf. A failing read is propagated to the caller.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest,
+                                                nsIInputStream* aInputStream,
+                                                uint64_t aOffset,
+                                                uint32_t aCount) {
+  MOZ_ASSERT(NS_IsMainThread());
+  uint32_t bytesConsumed;
+  nsresult rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf,
+                                           aCount, &bytesConsumed);
+  NS_ENSURE_SUCCESS(rv, rv);
+  return NS_OK;
+}
+
+// nsIRequestObserver: runs once the STT request has finished. The JSON
+// response accumulated in mBuf is parsed and turned into either a final-result
+// event or an error reported to mRecognition. mBuf is cleared on every exit
+// path, and nothing is reported if the recognition was aborted.
+//
+// The response comes from the network, so its shape is validated before any
+// member is accessed: jsoncpp treats a type mismatch (e.g. indexing a
+// non-array, or converting a string to float) as a fatal error.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest,
+                                              nsresult aStatusCode) {
+  MOZ_ASSERT(NS_IsMainThread());
+
+  auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); });
+
+  if (mAborted) {
+    return NS_OK;
+  }
+
+  bool success = false;
+  float confidence = 0;
+  Json::Value root;
+  Json::CharReaderBuilder builder;
+  nsAutoCString status;
+  nsAutoCString hypoValue;
+  nsAutoString errorMsg;
+  SpeechRecognitionErrorCode errorCode = SpeechRecognitionErrorCode::Network;
+
+  SR_LOG("STT Result: %s", mBuf.get());
+
+  if (NS_FAILED(aStatusCode)) {
+    success = false;
+    errorMsg.Assign(NS_LITERAL_STRING("Error connecting to the service."));
+    errorCode = SpeechRecognitionErrorCode::Network;
+  } else {
+    success = true;
+    UniquePtr<Json::CharReader> const reader(builder.newCharReader());
+    const bool parsingSuccessful =
+        reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, nullptr);
+    // Member lookups below are only valid on a JSON object.
+    if (!parsingSuccessful || !root.isObject()) {
+      // there's an internal server error
+      success = false;
+      errorMsg.Assign(NS_LITERAL_STRING("Internal server error"));
+      errorCode = SpeechRecognitionErrorCode::Network;
+    } else {
+      status.Assign(root.get("status", "error").asString().c_str());
+      if (status.EqualsLiteral("ok")) {
+        // ok, we have a result
+        const Json::Value& data = root["data"];
+        if (data.isArray() && !data.empty() && data[0].isObject()) {
+          const Json::Value& best = data[0];
+          hypoValue.Assign(best.get("text", "").asString().c_str());
+          // The default must be numeric: asFloat() cannot convert a string.
+          const Json::Value score = best.get("confidence", 0.0);
+          confidence = score.isNumeric() ? score.asFloat() : 0.0f;
+        } else {
+          success = false;
+          errorMsg.Assign(NS_LITERAL_STRING("Error reading result data."));
+          errorCode = SpeechRecognitionErrorCode::Network;
+        }
+      } else {
+        success = false;
+        NS_ConvertUTF8toUTF16 error(root.get("message", "").asString().c_str());
+        errorMsg.Assign(error);
+        errorCode = SpeechRecognitionErrorCode::No_speech;
+      }
+    }
+  }
+
+  if (!success) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg);
+  } else {
+    // Declare javascript result events
+    RefPtr<SpeechEvent> event = new SpeechEvent(
+        mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT);
+    SpeechRecognitionResultList* resultList =
+        new SpeechRecognitionResultList(mRecognition);
+    SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition);
+
+    // The service yields a single hypothesis; expose it only if the page asked
+    // for at least one alternative.
+    if (mRecognition->MaxAlternatives() > 0) {
+      SpeechRecognitionAlternative* alternative =
+          new SpeechRecognitionAlternative(mRecognition);
+
+      alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue);
+      alternative->mConfidence = confidence;
+
+      result->mItems.AppendElement(alternative);
+    }
+    resultList->mItems.AppendElement(result);
+
+    event->mRecognitionResultList = resultList;
+    NS_DispatchToMainThread(event);
+  }
+
+  return NS_OK;
+}
+
+OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default;  // Members use their in-class defaults.
+OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default;  // No explicit teardown here.
+
+// Binds the service to aSpeechRecognition: wraps it in a main-thread pointer
+// holder, caches the task queue it provides for encoding and creates the Ogg
+// writer used to package the encoded audio.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::Initialize(
+    WeakPtr<SpeechRecognition> aSpeechRecognition) {
+  MOZ_ASSERT(NS_IsMainThread());
+  mRecognition = new nsMainThreadPtrHolder<SpeechRecognition>(
+      "OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition);
+  mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding();
+  MOZ_ASSERT(mEncodeTaskQueue);
+  mWriter = MakeUnique<OggWriter>();
+  return NS_OK;
+}
+
+// Called off the main thread whenever the encoder has produced frames. The
+// frames are written into the Ogg container, the resulting bytes are appended
+// to mEncodedData and, once encoding has completed, the upload (DoSTT) is
+// scheduled on the main thread.
+void OnlineSpeechRecognitionService::EncoderDataAvailable() {
+  MOZ_ASSERT(!NS_IsMainThread());
+  AutoTArray<RefPtr<EncodedFrame>, 4> container;
+  nsresult rv = mAudioEncoder->GetEncodedTrack(container);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    MOZ_ASSERT_UNREACHABLE();
+  }
+
+  // Sampled after the frames were pulled; it drives the writer flags and the
+  // decision to start the upload.
+  const bool encodingComplete = mAudioEncoder->IsEncodingComplete();
+
+  rv = mWriter->WriteEncodedTrack(
+      container, encodingComplete ? ContainerWriter::END_OF_STREAM : 0);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    MOZ_ASSERT_UNREACHABLE();
+  }
+
+  mWriter->GetContainerData(
+      &mEncodedData, encodingComplete ? ContainerWriter::FLUSH_NEEDED : 0);
+
+  if (!encodingComplete) {
+    return;
+  }
+
+  NS_DispatchToMainThread(
+      NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this,
+                        &OnlineSpeechRecognitionService::DoSTT));
+}
+
+// Called off the main thread once the encoder is initialized. Hands the
+// encoder's (Opus) metadata to the container writer and stores the container
+// header as the first chunk of mEncodedData.
+void OnlineSpeechRecognitionService::EncoderInitialized() {
+  MOZ_ASSERT(!NS_IsMainThread());
+  RefPtr<TrackMetadataBase> trackMetadata = mAudioEncoder->GetMetadata();
+  if (trackMetadata->GetKind() != TrackMetadataBase::METADATA_OPUS) {
+    SR_LOG("wrong meta data type!");
+    MOZ_ASSERT_UNREACHABLE();
+  }
+
+  AutoTArray<RefPtr<TrackMetadataBase>, 1> metadata;
+  metadata.AppendElement(std::move(trackMetadata));
+
+  nsresult rv = mWriter->SetMetadata(metadata);
+  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+
+  rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER);
+  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+}
+
+// Called off the main thread when the encoder fails. Drops whatever audio was
+// collected so far and reports an audio-capture error from the main thread.
+void OnlineSpeechRecognitionService::EncoderError() {
+  MOZ_ASSERT(!NS_IsMainThread());
+  SR_LOG("Error encoding frames.");
+  mEncodedData.Clear();
+
+  // |self| keeps this service alive until the runnable has executed.
+  nsCOMPtr<nsIRunnable> reportError = NS_NewRunnableFunction(
+      "SpeechRecognition::DispatchError",
+      [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
+        if (mRecognition) {
+          mRecognition->DispatchError(
+              SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+              SpeechRecognitionErrorCode::Audio_capture,
+              NS_LITERAL_STRING("Encoder error"));
+        }
+      });
+  NS_DispatchToMainThread(reportError.forget());
+}
+
+// Feeds one captured audio segment to the Opus encoder (created lazily on the
+// first non-empty segment, bound to the calling thread). Once
+// MAX_LISTENING_TIME_MS have elapsed since the first segment, a Stop() of the
+// recognition is requested on the main thread. Runs off the main thread.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment,
+                                                    int32_t aSampleRate) {
+  MOZ_ASSERT(!NS_IsMainThread());
+  if (aAudioSegment->GetDuration() <= 0) {
+    // Nothing to encode.
+    return NS_OK;
+  }
+
+  if (!mAudioEncoder) {
+    mSpeechEncoderListener = new SpeechEncoderListener(this);
+    mAudioEncoder = MakeAndAddRef<OpusTrackEncoder>(aSampleRate);
+    RefPtr<AbstractThread> currentThread = AbstractThread::GetCurrent();
+    mAudioEncoder->SetWorkerThread(currentThread);
+    mAudioEncoder->RegisterListener(mSpeechEncoderListener);
+  }
+
+  mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment));
+
+  const TimeStamp now = TimeStamp::Now();
+  if (mFirstIteration.IsNull()) {
+    mFirstIteration = now;
+  }
+
+  const bool listenedLongEnough =
+      (now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS;
+  if (listenedLongEnough) {
+    // |self| keeps this service alive until the runnable has executed.
+    NS_DispatchToMainThread(NS_NewRunnableFunction(
+        "SpeechRecognition::Stop",
+        [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
+          if (mRecognition) {
+            mRecognition->Stop();
+          }
+        }));
+  }
+
+  return NS_OK;
+}
+
+// Uploads the Ogg/Opus audio collected in mEncodedData to the speech-to-text
+// endpoint as an HTTP POST. The endpoint comes from the
+// media.webspeech.service.endpoint pref, falling back to
+// DEFAULT_RECOGNITION_ENDPOINT. The response is delivered to this object
+// through its nsIStreamListener implementation. Every failure before the
+// request is opened is reported to mRecognition and ends the upload. Does
+// nothing if the recognition was aborted.
+void OnlineSpeechRecognitionService::DoSTT() {
+  MOZ_ASSERT(NS_IsMainThread());
+
+  if (mAborted) {
+    return;
+  }
+
+  nsresult rv;
+  nsCOMPtr<nsIChannel> chan;
+  nsCOMPtr<nsIURI> uri;
+  nsAutoCString speechRecognitionEndpoint;
+  nsAutoCString prefEndpoint;
+  nsAutoString language;
+
+  Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT,
+                          prefEndpoint);
+
+  if (!prefEndpoint.IsEmpty()) {
+    speechRecognitionEndpoint = prefEndpoint;
+  } else {
+    speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT;
+  }
+
+  rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr,
+                 nullptr);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Network, NS_LITERAL_STRING("Unknown URI"));
+    return;
+  }
+
+  nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_DATA_INHERITS;
+  nsLoadFlags loadFlags =
+      nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER;
+  nsContentPolicyType contentPolicy =
+      nsContentUtils::InternalContentPolicyTypeToExternal(
+          nsIContentPolicy::TYPE_OTHER);
+
+  nsPIDOMWindowInner* window = mRecognition->GetOwner();
+  if (NS_WARN_IF(!window)) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Aborted, NS_LITERAL_STRING("No window"));
+    return;
+  }
+
+  Document* doc = window->GetExtantDoc();
+  if (NS_WARN_IF(!doc)) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Aborted, NS_LITERAL_STRING("No document"));
+    // |doc| is dereferenced right below, so we must not continue without it.
+    return;
+  }
+  rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags,
+                     contentPolicy, nullptr, nullptr, nullptr, nullptr,
+                     loadFlags);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Network,
+        NS_LITERAL_STRING("Failed to open channel"));
+    return;
+  }
+
+  nsCOMPtr<nsIHttpChannel> httpChan = do_QueryInterface(chan);
+  if (httpChan) {
+    rv = httpChan->SetRequestMethod(NS_LITERAL_CSTRING("POST"));
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+
+    mRecognition->GetLang(language);
+    // Accept-Language-STT is a custom header of our backend server used to set
+    // the language of the speech sample being submitted by the client
+    rv = httpChan->SetRequestHeader(NS_LITERAL_CSTRING("Accept-Language-STT"),
+                                    NS_ConvertUTF16toUTF8(language), false);
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+    // Tell the server to not store the transcription by default
+    rv = httpChan->SetRequestHeader(NS_LITERAL_CSTRING("Store-Transcription"),
+                                    NS_LITERAL_CSTRING("0"), false);
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+    // Tell the server to not store the sample by default
+    rv = httpChan->SetRequestHeader(NS_LITERAL_CSTRING("Store-Sample"),
+                                    NS_LITERAL_CSTRING("0"), false);
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+    // Set the product tag as the web speech api
+    rv = httpChan->SetRequestHeader(NS_LITERAL_CSTRING("Product-Tag"),
+                                    NS_LITERAL_CSTRING("wsa"), false);
+    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+  }
+
+  nsCOMPtr<nsIClassOfService> cos(do_QueryInterface(chan));
+  if (cos) {
+    cos->AddClassFlags(nsIClassOfService::UrgentStart);
+  }
+
+  nsCOMPtr<nsIUploadChannel2> uploadChan = do_QueryInterface(chan);
+  if (uploadChan) {
+    nsCOMPtr<nsIInputStream> bodyStream;
+    // Flatten the encoded chunks into one contiguous buffer for the upload.
+    uint32_t length = 0;
+    for (const nsTArray<uint8_t>& chunk : mEncodedData) {
+      length += chunk.Length();
+    }
+
+    nsTArray<uint8_t> audio;
+    if (!audio.SetCapacity(length, fallible)) {
+      mRecognition->DispatchError(
+          SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+          SpeechRecognitionErrorCode::Audio_capture,
+          NS_LITERAL_STRING("Allocation error"));
+      return;
+    }
+
+    for (const nsTArray<uint8_t>& chunk : mEncodedData) {
+      audio.AppendElements(chunk);
+    }
+
+    mEncodedData.Clear();
+
+    rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio));
+    if (NS_WARN_IF(NS_FAILED(rv))) {
+      mRecognition->DispatchError(
+          SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+          SpeechRecognitionErrorCode::Network,
+          NS_LITERAL_STRING("Failed to open stream"));
+      return;
+    }
+    if (bodyStream) {
+      rv = uploadChan->ExplicitSetUploadStream(
+          bodyStream, NS_LITERAL_CSTRING("audio/ogg"), length,
+          NS_LITERAL_CSTRING("POST"), false);
+      MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
+    }
+  }
+
+  rv = chan->AsyncOpen(this);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    mRecognition->DispatchError(
+        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
+        SpeechRecognitionErrorCode::Network,
+        NS_LITERAL_STRING("Internal server error"));
+  }
+}
+
+// Signals the end of the audio input. On the encode task queue the encoder is
+// told that the stream has ended and is then released together with its
+// listener. The queue reference is dropped afterwards, so later calls (and
+// calls before Initialize) are no-ops.
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::SoundEnd() {
+  MOZ_ASSERT(NS_IsMainThread());
+
+  if (!mEncodeTaskQueue) {
+    // Not initialized
+    return NS_OK;
+  }
+
+  // |self| keeps this service alive until the runnable has executed.
+  nsCOMPtr<nsIRunnable> finishEncoding = NS_NewRunnableFunction(
+      "OnlineSpeechRecognitionService::SoundEnd",
+      [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
+        if (!mAudioEncoder) {
+          return;
+        }
+        mAudioEncoder->NotifyEndOfStream();
+        mAudioEncoder->UnregisterListener(mSpeechEncoderListener);
+        mSpeechEncoderListener = nullptr;
+        mAudioEncoder = nullptr;
+      });
+
+  nsresult rv = mEncodeTaskQueue->Dispatch(finishEncoding.forget());
+  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+  Unused << rv;
+
+  mEncodeTaskQueue = nullptr;
+
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::ValidateAndSetGrammarList(
+    SpeechGrammar* aSpeechGrammar,
+    nsISpeechGrammarCompilationCallback* aCallback) {
+  // This is an online LVCSR (STT) service, so there is no grammar to set:
+  // both arguments are ignored and the callback is never invoked.
+  return NS_OK;
+}
+
+// Cancels the recognition. Only the first call has an effect: it raises
+// mAborted and then shuts the encoder down through SoundEnd().
+NS_IMETHODIMP
+OnlineSpeechRecognitionService::Abort() {
+  MOZ_ASSERT(NS_IsMainThread());
+  if (!mAborted) {
+    mAborted = true;
+    return SoundEnd();
+  }
+  return NS_OK;
+}
+}  // namespace mozilla
--- a/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h
+++ b/dom/media/webspeech/recognition/OnlineSpeechRecognitionService.h
@ -0,0 +1,133 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_OnlineRecognitionService_h
+#define mozilla_dom_OnlineRecognitionService_h
+
+#include "nsCOMPtr.h"
+#include "nsTArray.h"
+#include "nsISpeechRecognitionService.h"
+#include "speex/speex_resampler.h"
+#include "nsIStreamListener.h"
+#include "OpusTrackEncoder.h"
+#include "ContainerWriter.h"
+
+#define NS_ONLINE_SPEECH_RECOGNITION_SERVICE_CID \
+  {0x0ff5ce56,                                   \
+   0x5b09,                                       \
+   0x4db8,                                       \
+   {0xad, 0xc6, 0x82, 0x66, 0xaf, 0x95, 0xf8, 0x64}};
+
+namespace mozilla {
+
+namespace ipc {
+class PrincipalInfo;
+}  // namespace ipc
+
+/**
+ * Online implementation of the nsISpeechRecognitionService interface
+ */
+class OnlineSpeechRecognitionService : public nsISpeechRecognitionService,
+                                       public nsIStreamListener {
+ public:
+  // Add XPCOM glue code
+  NS_DECL_THREADSAFE_ISUPPORTS
+  NS_DECL_NSISPEECHRECOGNITIONSERVICE
+  NS_DECL_NSIREQUESTOBSERVER
+  NS_DECL_NSISTREAMLISTENER
+
+  /**
+   * Listener responsible for handling the events raised by the TrackEncoder.
+   * Each callback asserts that it runs on the thread the listener was created
+   * on and forwards to the matching Encoder*() method of the owning service.
+   */
+  class SpeechEncoderListener : public TrackEncoderListener {
+   public:
+    explicit SpeechEncoderListener(OnlineSpeechRecognitionService* aService)
+        : mService(aService), mOwningThread(AbstractThread::GetCurrent()) {}
+
+    void Initialized(TrackEncoder* aEncoder) override {
+      MOZ_ASSERT(mOwningThread->IsCurrentThreadIn());
+      mService->EncoderInitialized();
+    }
+
+    void DataAvailable(TrackEncoder* aEncoder) override {
+      MOZ_ASSERT(mOwningThread->IsCurrentThreadIn());
+      mService->EncoderDataAvailable();
+    }
+
+    void Error(TrackEncoder* aEncoder) override {
+      MOZ_ASSERT(mOwningThread->IsCurrentThreadIn());
+      mService->EncoderError();
+    }
+
+   private:
+    const RefPtr<OnlineSpeechRecognitionService> mService;
+    const RefPtr<AbstractThread> mOwningThread;
+  };
+
+  /**
+   * Default constructs an OnlineSpeechRecognitionService
+   */
+  OnlineSpeechRecognitionService();
+
+  /**
+   * Called by SpeechEncoderListener when the AudioTrackEncoder has been
+   * initialized.
+   */
+  void EncoderInitialized();
+
+  /**
+   * Called by SpeechEncoderListener when the AudioTrackEncoder has encoded
+   * some data for us to pass along.
+   */
+  void EncoderDataAvailable();
+
+  /**
+   * Called by SpeechEncoderListener when the AudioTrackEncoder has
+   * encountered an error.
+   */
+  void EncoderError();
+
+ private:
+  /**
+   * Private destructor to prevent bypassing of reference counting
+   */
+  virtual ~OnlineSpeechRecognitionService();
+
+  /** The associated SpeechRecognition */
+  nsMainThreadPtrHandle<dom::SpeechRecognition> mRecognition;
+
+  /**
+   * Builds a mock SpeechRecognitionResultList.
+   * NOTE(review): no definition is visible in the accompanying .cpp; confirm
+   * whether this declaration is still needed.
+   */
+  dom::SpeechRecognitionResultList* BuildMockResultList();
+
+  /**
+   * Method responsible for uploading the audio to the remote endpoint
+   */
+  void DoSTT();
+
+  // Encoded and packaged ogg audio data
+  nsTArray<nsTArray<uint8_t>> mEncodedData;
+  // Member responsible for holding a reference to the TrackEncoderListener
+  RefPtr<SpeechEncoderListener> mSpeechEncoderListener;
+  // Encoder responsible for encoding the frames from pcm to opus which is the
+  // format supported by our backend
+  RefPtr<AudioTrackEncoder> mAudioEncoder;
+  // Object responsible for wrapping the opus frames into an ogg container
+  UniquePtr<ContainerWriter> mWriter;
+  // Member responsible for storing the json string returned by the endpoint
+  nsCString mBuf;
+  // Used to calculate a ceiling on the time spent listening.
+  TimeStamp mFirstIteration;
+  // Set by Abort(); makes DoSTT() and OnStopRequest() return early
+  bool mAborted = false;
+  // Task queue on which SoundEnd() shuts the audio encoder down
+  RefPtr<TaskQueue> mEncodeTaskQueue;
+};
+
+}  // namespace mozilla
+
+#endif
--- a/dom/media/webspeech/recognition/SpeechGrammar.h
+++ b/dom/media/webspeech/recognition/SpeechGrammar.h
@ -36,6 +36,11 @@ class SpeechGrammar final : public nsISupports, public nsWrapperCache {
  static already_AddRefed<SpeechGrammar> Constructor(
      const GlobalObject& aGlobal);

+  static already_AddRefed<SpeechGrammar> WebkitSpeechGrammar(
+      const GlobalObject& aGlobal, ErrorResult& aRv) {
+    return Constructor(aGlobal);  // Forwards to Constructor; aRv is never set.
+  }
+
  void GetSrc(nsString& aRetVal, ErrorResult& aRv) const;

  void SetSrc(const nsAString& aArg, ErrorResult& aRv);
--- a/dom/media/webspeech/recognition/SpeechGrammarList.h
+++ b/dom/media/webspeech/recognition/SpeechGrammarList.h
@ -35,6 +35,11 @@ class SpeechGrammarList final : public nsISupports, public nsWrapperCache {
  static already_AddRefed<SpeechGrammarList> Constructor(
      const GlobalObject& aGlobal);

+  static already_AddRefed<SpeechGrammarList> WebkitSpeechGrammarList(
+      const GlobalObject& aGlobal, ErrorResult& aRv) {
+    return Constructor(aGlobal);  // Forwards to Constructor; aRv is never set.
+  }
+
  nsISupports* GetParentObject() const;

  JSObject* WrapObject(JSContext* aCx,
--- a/dom/media/webspeech/recognition/SpeechRecognition.cpp
+++ b/dom/media/webspeech/recognition/SpeechRecognition.cpp
@ -19,7 +19,8 @@
 #include "mozilla/Preferences.h"
 #include "mozilla/Services.h"
 #include "mozilla/StaticPrefs_media.h"
-
+#include "mozilla/AbstractThread.h"
+#include "VideoUtils.h"
 #include "AudioSegment.h"
 #include "MediaEnginePrefs.h"
 #include "endpointer.h"
@ -46,17 +47,17 @@ namespace mozilla {
 namespace dom {

 #define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
-#define DEFAULT_RECOGNITION_SERVICE_PREFIX "pocketsphinx-"
-#define DEFAULT_RECOGNITION_SERVICE "pocketsphinx-en-US"
+#define DEFAULT_RECOGNITION_SERVICE "online"

 #define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
 #define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH \
  "media.webspeech.long_silence_length"
 #define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH \
  "media.webspeech.long_speech_length"
+#define PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS \
+  "media.webspeech.recognition.timeout"

 static const uint32_t kSAMPLE_RATE = 16000;
-static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

 // number of frames corresponding to 300ms of audio to send to endpointer while
 // it's in environment estimation mode
@ -70,7 +71,31 @@ LogModule* GetSpeechRecognitionLog() {
 #define SR_LOG(...) \
  MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))

-already_AddRefed<nsISpeechRecognitionService> GetSpeechRecognitionService(
+namespace {
+class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker {
+ public:
+  SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition,
+                                   const nsString& aName)
+      : media::ShutdownBlocker(aName), mRecognition(aRecognition) {}
+
+  NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override {
+    MOZ_ASSERT(NS_IsMainThread());
+    // Presumably Abort() ends in AbortSilently, which clears the blocker.
+    mRecognition->Abort();
+    return NS_OK;
+  }
+
+ private:
+  const RefPtr<SpeechRecognition> mRecognition;
+};
+
+enum class ServiceCreationError {
+  ServiceNotFound,
+};
+
+Result<nsCOMPtr<nsISpeechRecognitionService>, ServiceCreationError>
+CreateSpeechRecognitionService(nsPIDOMWindowInner* aWindow,
+                               SpeechRecognition* aRecognition,
                               const nsAString& aLang) {
  nsAutoCString speechRecognitionServiceCID;

@ -78,11 +103,7 @@ already_AddRefed<nsISpeechRecognitionService> GetSpeechRecognitionService(
  Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE, prefValue);
  nsAutoCString speechRecognitionService;

-  if (!aLang.IsEmpty()) {
-    speechRecognitionService =
-        NS_LITERAL_CSTRING(DEFAULT_RECOGNITION_SERVICE_PREFIX) +
-        NS_ConvertUTF16toUTF8(aLang);
-  } else if (!prefValue.IsEmpty()) {
+  if (!prefValue.IsEmpty()) {
    speechRecognitionService = prefValue;
  } else {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
@ -99,27 +120,15 @@ already_AddRefed<nsISpeechRecognitionService> GetSpeechRecognitionService(

  nsresult rv;
  nsCOMPtr<nsISpeechRecognitionService> recognitionService;
-  recognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
-  return recognitionService.forget();
+  recognitionService =
+      do_CreateInstance(speechRecognitionServiceCID.get(), &rv);
+  if (!recognitionService) {
+    return Err(ServiceCreationError::ServiceNotFound);
  }

-class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker {
- public:
-  explicit SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition)
-      : media::ShutdownBlocker(NS_LITERAL_STRING("SpeechRecognition shutdown")),
-        mRecognition(aRecognition) {}
-
-  NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override {
-    MOZ_ASSERT(NS_IsMainThread());
-
-    // AbortSilently will eventually clear the blocker.
-    mRecognition->Abort();
-    return NS_OK;
+  return recognitionService;
 }
-
- private:
-  const RefPtr<SpeechRecognition> mRecognition;
-};
+}  // namespace

 NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechRecognition, DOMEventTargetHelper,
                                   mStream, mTrack, mRecognitionService,
@ -137,7 +146,8 @@ SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
      mEndpointer(kSAMPLE_RATE),
      mAudioSamplesPerChunk(mEndpointer.FrameSize()),
      mSpeechDetectionTimer(NS_NewTimer()),
-      mSpeechGrammarList(new SpeechGrammarList(GetParentObject())),
+      mSpeechGrammarList(new SpeechGrammarList(GetOwner())),
+      mContinuous(false),
      mInterimResults(false),
      mMaxAlternatives(1) {
  SR_LOG("created SpeechRecognition");
@ -154,6 +164,10 @@ SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 3 * 1000000));
+
+  mSpeechDetectionTimeoutMs =
+      Preferences::GetInt(PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS, 10000);
+
  Reset();
 }

@ -211,8 +225,6 @@ already_AddRefed<SpeechRecognition> SpeechRecognition::Constructor(
  return object.forget();
 }

-nsISupports* SpeechRecognition::GetParentObject() const { return GetOwner(); }
-
 void SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) {
  SR_LOG("Processing %s, current state is %s", GetName(aEvent),
         GetName(mCurrentState));
@ -245,8 +257,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
-        case EVENT_COUNT:
-          MOZ_CRASH("Invalid event EVENT_COUNT");
+        default:
+          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_STARTING:
@ -262,7 +274,7 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
-          Reset();
+          ResetAndEnd();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
@ -271,8 +283,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
-        case EVENT_COUNT:
-          MOZ_CRASH("Invalid event EVENT_COUNT");
+        default:
+          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_ESTIMATING:
@ -297,8 +309,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);
          MOZ_CRASH();
-        case EVENT_COUNT:
-          MOZ_CRASH("Invalid event EVENT_COUNT");
+        default:
+          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
@ -323,8 +335,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
-        case EVENT_COUNT:
-          MOZ_CRASH("Invalid event EVENT_COUNT");
+        default:
+          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_RECOGNIZING:
@ -349,8 +361,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));
          MOZ_CRASH();
-        case EVENT_COUNT:
-          MOZ_CRASH("Invalid event EVENT_COUNT");
+        default:
+          MOZ_CRASH("Invalid event");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
@ -376,12 +388,30 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s",
                 GetName(aEvent));
          MOZ_CRASH();
-        case EVENT_COUNT:
-          MOZ_CRASH("Invalid event EVENT_COUNT");
+        default:
+          MOZ_CRASH("Invalid event");
      }
      break;
-    case STATE_COUNT:
-      MOZ_CRASH("Invalid state STATE_COUNT");
+    case STATE_ABORTING:
+      switch (aEvent->mType) {
+        case EVENT_STOP:
+        case EVENT_ABORT:
+        case EVENT_AUDIO_DATA:
+        case EVENT_AUDIO_ERROR:
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          DoNothing(aEvent);
+          break;
+        case EVENT_START:
+          SR_LOG("STATE_ABORTING: Unhandled aEvent %s", GetName(aEvent));
+          MOZ_CRASH();
+        default:
+          MOZ_CRASH("Invalid event");
+      }
+      break;
+    default:
+      MOZ_CRASH("Invalid state");
  }
 }

@ -400,7 +430,17 @@ uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment,
    iterator.Next();
  }

-  mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate);
+  // we need to call the nsISpeechRecognitionService::ProcessAudioSegment
+  // in a separate thread so that any eventual encoding or pre-processing
+  // of the audio does not block the main thread
+  nsresult rv = mEncodeTaskQueue->Dispatch(
+      NewRunnableMethod<StoreCopyPassByPtr<AudioSegment>, TrackRate>(
+          "nsISpeechRecognitionService::ProcessAudioSegment",
+          mRecognitionService,
+          &nsISpeechRecognitionService::ProcessAudioSegment,
+          std::move(*aSegment), aTrackRate));
+  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
+  Unused << rv;
  return samples;
 }

@ -421,7 +461,19 @@ uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment,

 void SpeechRecognition::Reset() {
  SetState(STATE_IDLE);
+
+  // This breaks potential ref-cycles.
  mRecognitionService = nullptr;
+
+  ++mStreamGeneration;
+  if (mStream) {
+    mStream->UnregisterTrackListener(this);
+    mStream = nullptr;
+  }
+  mTrack = nullptr;
+  mTrackIsOwned = false;
+  mStopRecordingPromise = nullptr;
+  mEncodeTaskQueue = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
@ -454,7 +506,12 @@ void SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent) {
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
-  mRecognitionService->SoundEnd();
+
+  // This will run SoundEnd on the service just before StopRecording begins
+  // shutting the encode thread down.
+  mSpeechListener->mRemovedPromise->Then(
+      GetCurrentThreadSerialEventTarget(), __func__,
+      [service = mRecognitionService] { service->SoundEnd(); });

  StopRecording();
 }
@ -518,14 +575,23 @@ void SpeechRecognition::DoNothing(SpeechEvent* aEvent) {}

 void SpeechRecognition::AbortSilently(SpeechEvent* aEvent) {
  if (mRecognitionService) {
+    if (mTrack) {
+      // This will run Abort on the service just before StopRecording begins
+      // shutting the encode thread down.
+      mSpeechListener->mRemovedPromise->Then(
+          GetCurrentThreadSerialEventTarget(), __func__,
+          [service = mRecognitionService] { service->Abort(); });
+    } else {
+      // Recording hasn't started yet. We can just call Abort().
      mRecognitionService->Abort();
    }
-
-  if (mTrack) {
-    StopRecording();
  }

-  ResetAndEnd();
+  StopRecording()->Then(
+      GetCurrentThreadSerialEventTarget(), __func__,
+      [self = RefPtr<SpeechRecognition>(this), this] { ResetAndEnd(); });
+
+  SetState(STATE_ABORTING);
 }

 void SpeechRecognition::AbortError(SpeechEvent* aEvent) {
@ -544,54 +610,83 @@ void SpeechRecognition::NotifyError(SpeechEvent* aEvent) {
 **************************************/
 NS_IMETHODIMP
 SpeechRecognition::StartRecording(RefPtr<AudioStreamTrack>& aTrack) {
-  // hold a reference so that the underlying track
-  // doesn't get Destroy()'ed
+  // hold a reference so that the underlying track doesn't get collected.
  mTrack = aTrack;
+  MOZ_ASSERT(!mTrack->Ended());

-  if (NS_WARN_IF(mTrack->Ended())) {
-    return NS_ERROR_UNEXPECTED;
-  }
  mSpeechListener = new SpeechTrackListener(this);
  mTrack->AddListener(mSpeechListener);

-  mShutdownBlocker = MakeAndAddRef<SpeechRecognitionShutdownBlocker>(this);
+  nsString blockerName;
+  blockerName.AppendPrintf("SpeechRecognition %p shutdown", this);
+  mShutdownBlocker =
+      MakeAndAddRef<SpeechRecognitionShutdownBlocker>(this, blockerName);
  RefPtr<nsIAsyncShutdownClient> shutdown = media::GetShutdownBarrier();
  shutdown->AddBlocker(mShutdownBlocker, NS_LITERAL_STRING(__FILE__), __LINE__,
                       NS_LITERAL_STRING("SpeechRecognition shutdown"));

  mEndpointer.StartSession();

-  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
+  return mSpeechDetectionTimer->Init(this, mSpeechDetectionTimeoutMs,
                                     nsITimer::TYPE_ONE_SHOT);
 }

-NS_IMETHODIMP
-SpeechRecognition::StopRecording() {
-  if (mShutdownBlocker) {
-    // Block shutdown until the speech track listener has been removed from the
-    // MTG, as it holds a reference to us, and we reference the world, which we
-    // don't want to leak.
-    mSpeechListener->mRemovedPromise->Then(
-        GetCurrentThreadSerialEventTarget(), __func__,
-        [blocker = std::move(mShutdownBlocker)] {
-          RefPtr<nsIAsyncShutdownClient> shutdown = media::GetShutdownBarrier();
-          nsresult rv = shutdown->RemoveBlocker(blocker);
-          MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
-          Unused << rv;
-        });
-  }
-  MOZ_ASSERT(!mShutdownBlocker);
-
+RefPtr<GenericNonExclusivePromise> SpeechRecognition::StopRecording() {
+  if (!mTrack) {
+    // Recording wasn't started, or has already been stopped.
+    if (mStream) {
+      // Ensure we don't start recording because a track became available
+      // before we get reset.
      mStream->UnregisterTrackListener(this);
+    }
+    return GenericNonExclusivePromise::CreateAndResolve(true, __func__);
+  }
+
+  if (mStopRecordingPromise) {
+    return mStopRecordingPromise;
+  }
+
  mTrack->RemoveListener(mSpeechListener);
-  mStream = nullptr;
-  mSpeechListener = nullptr;
-  mTrack = nullptr;
+  if (mTrackIsOwned) {
+    mTrack->Stop();
+  }

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

-  return NS_OK;
+  // Block shutdown until the speech track listener has been removed from the
+  // MTG, as it holds a reference to us, and we reference the world, which we
+  // don't want to leak.
+  mStopRecordingPromise =
+      mSpeechListener->mRemovedPromise
+          ->Then(
+              GetCurrentThreadSerialEventTarget(), __func__,
+              [self = RefPtr<SpeechRecognition>(this), this] {
+                SR_LOG("Shutting down encoding thread");
+                return mEncodeTaskQueue->BeginShutdown();
+              },
+              [] {
+                MOZ_CRASH("Unexpected rejection");
+                return ShutdownPromise::CreateAndResolve(false, __func__);
+              })
+          ->Then(
+              GetCurrentThreadSerialEventTarget(), __func__,
+              [self = RefPtr<SpeechRecognition>(this), this] {
+                RefPtr<nsIAsyncShutdownClient> shutdown =
+                    media::GetShutdownBarrier();
+                shutdown->RemoveBlocker(mShutdownBlocker);
+                mShutdownBlocker = nullptr;
+
+                MOZ_DIAGNOSTIC_ASSERT(mCurrentState != STATE_IDLE);
+                return GenericNonExclusivePromise::CreateAndResolve(true,
+                                                                    __func__);
+              },
+              [] {
+                MOZ_CRASH("Unexpected rejection");
+                return GenericNonExclusivePromise::CreateAndResolve(false,
+                                                                    __func__);
+              });
+  return mStopRecordingPromise;
 }

 NS_IMETHODIMP
@ -648,12 +743,11 @@ void SpeechRecognition::GetLang(nsString& aRetVal) const { aRetVal = mLang; }
 void SpeechRecognition::SetLang(const nsAString& aArg) { mLang = aArg; }

 bool SpeechRecognition::GetContinuous(ErrorResult& aRv) const {
-  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
-  return false;
+  return mContinuous;
 }

 void SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) {
-  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
+  mContinuous = aArg;
 }

 bool SpeechRecognition::InterimResults() const { return mInterimResults; }
@ -690,6 +784,10 @@ void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
    return;
  }

+  mEncodeTaskQueue = MakeAndAddRef<TaskQueue>(
+      GetMediaThreadPool(MediaThreadType::WEBRTC_DECODER),
+      "WebSpeechEncoderThread");
+
  nsresult rv;
  rv = mRecognitionService->Initialize(this);
  if (NS_WARN_IF(NS_FAILED(rv))) {
@ -701,6 +799,7 @@ void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,

  if (aStream.WasPassed()) {
    mStream = &aStream.Value();
+    mTrackIsOwned = false;
    mStream->RegisterTrackListener(this);
    nsTArray<RefPtr<AudioStreamTrack>> tracks;
    mStream->GetAudioTracks(tracks);
@ -711,24 +810,40 @@ void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
      }
    }
  } else {
+    mTrackIsOwned = true;
    AutoNoJSAPI nojsapi;
    RefPtr<SpeechRecognition> self(this);
    MediaManager::Get()
        ->GetUserMedia(GetOwner(), constraints, aCallerType)
        ->Then(
            GetCurrentThreadSerialEventTarget(), __func__,
-            [this, self](RefPtr<DOMMediaStream>&& aStream) {
+            [this, self,
+             generation = mStreamGeneration](RefPtr<DOMMediaStream>&& aStream) {
+              nsTArray<RefPtr<AudioStreamTrack>> tracks;
+              aStream->GetAudioTracks(tracks);
+              if (mAborted || mCurrentState != STATE_STARTING ||
+                  mStreamGeneration != generation) {
+                // We were probably aborted. Exit early.
+                for (const RefPtr<AudioStreamTrack>& track : tracks) {
+                  track->Stop();
+                }
+                return;
+              }
              mStream = std::move(aStream);
              mStream->RegisterTrackListener(this);
-              nsTArray<RefPtr<AudioStreamTrack>> tracks;
-              mStream->GetAudioTracks(tracks);
              for (const RefPtr<AudioStreamTrack>& track : tracks) {
                if (!track->Ended()) {
                  NotifyTrackAdded(track);
                }
              }
            },
-            [this, self](RefPtr<MediaMgrError>&& error) {
+            [this, self,
+             generation = mStreamGeneration](RefPtr<MediaMgrError>&& error) {
+              if (mAborted || mCurrentState != STATE_STARTING ||
+                  mStreamGeneration != generation) {
+                // We were probably aborted. Exit early.
+                return;
+              }
              SpeechRecognitionErrorCode errorCode;

              if (error->mName == MediaMgrError::Name::NotAllowedError) {
@ -746,25 +861,18 @@ void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
 }

 bool SpeechRecognition::SetRecognitionService(ErrorResult& aRv) {
+  if (!GetOwner()) {
+    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+    return false;
+  }
+
  // See:
  // https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang
+  nsAutoString lang;
  if (!mLang.IsEmpty()) {
-    mRecognitionService = GetSpeechRecognitionService(mLang);
-
-    if (!mRecognitionService) {
-      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
-      return false;
-    }
-
-    return true;
-  }
-
-  nsCOMPtr<nsPIDOMWindowInner> window = GetOwner();
-  if (!window) {
-    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
-    return false;
-  }
-  nsCOMPtr<Document> document = window->GetExtantDoc();
+    lang = mLang;
+  } else {
+    nsCOMPtr<Document> document = GetOwner()->GetExtantDoc();
    if (!document) {
      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
      return false;
@ -777,13 +885,23 @@ bool SpeechRecognition::SetRecognitionService(ErrorResult& aRv) {

-    nsAutoString lang;
+    // Fill the function-scope |lang|; redeclaring it here would shadow it.
    element->GetLang(lang);
-  mRecognitionService = GetSpeechRecognitionService(lang);
+  }

-  if (!mRecognitionService) {
+  auto result = CreateSpeechRecognitionService(GetOwner(), this, lang);
+
+  if (result.isErr()) {
+    switch (result.unwrapErr()) {
+      case ServiceCreationError::ServiceNotFound:
        aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+        break;
+      default:
+        MOZ_CRASH("Unknown error");
+    }
    return false;
  }

+  mRecognitionService = result.unwrap();
+  MOZ_DIAGNOSTIC_ASSERT(mRecognitionService);
  return true;
 }

@ -794,11 +912,6 @@ bool SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv) {
  }

  uint32_t grammarListLength = mSpeechGrammarList->Length();
-  if (0 == grammarListLength) {
-    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
-    return false;
-  }
-
  for (uint32_t count = 0; count < grammarListLength; ++count) {
    RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv);
    if (aRv.Failed()) {
@ -825,6 +938,7 @@ void SpeechRecognition::Abort() {
  }

  mAborted = true;
+
  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
 }
@ -874,14 +988,13 @@ void SpeechRecognition::DispatchError(EventType aErrorType,
 uint32_t SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                              uint32_t aSampleCount) {
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
-  MOZ_ASSERT(mAudioSamplesBuffer.get());
+  MOZ_ASSERT(mAudioSamplesBuffer);

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy =
      std::min(aSampleCount, mAudioSamplesPerChunk - mBufferedSamples);

-  memcpy(samplesBuffer + mBufferedSamples, aSamples,
-         samplesToCopy * sizeof(int16_t));
+  PodCopy(samplesBuffer + mBufferedSamples, aSamples, samplesToCopy);

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
@ -903,8 +1016,8 @@ uint32_t SpeechRecognition::SplitSamplesBuffer(
    RefPtr<SharedBuffer> chunk =
        SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

-    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
-           mAudioSamplesPerChunk * sizeof(int16_t));
+    PodCopy(static_cast<short*>(chunk->Data()), aSamplesBuffer + chunkStart,
+            mAudioSamplesPerChunk);

    aResult.AppendElement(chunk.forget());
    chunkStart += mAudioSamplesPerChunk;
@ -987,6 +1100,7 @@ const char* SpeechRecognition::GetName(FSMState aId) {
      "STATE_IDLE",        "STATE_STARTING",
      "STATE_ESTIMATING",  "STATE_WAITING_FOR_SPEECH",
      "STATE_RECOGNIZING", "STATE_WAITING_FOR_RESULT",
+      "STATE_ABORTING",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
@ -1009,6 +1123,11 @@ const char* SpeechRecognition::GetName(SpeechEvent* aEvent) {
  return names[aEvent->mType];
 }

+TaskQueue* SpeechRecognition::GetTaskQueueForEncoding() const {
+  MOZ_ASSERT(NS_IsMainThread());
+  return mEncodeTaskQueue;
+}
+
 SpeechEvent::SpeechEvent(SpeechRecognition* aRecognition,
                         SpeechRecognition::EventType aType)
    : Runnable("dom::SpeechEvent"),
--- a/dom/media/webspeech/recognition/SpeechRecognition.h
+++ b/dom/media/webspeech/recognition/SpeechRecognition.h
@ -32,6 +32,10 @@

 namespace mozilla {

+namespace media {
+class ShutdownBlocker;
+}
+
 namespace dom {

 #define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \
@ -40,7 +44,6 @@ namespace dom {

 class GlobalObject;
 class AudioStreamTrack;
-class SpeechRecognitionShutdownBlocker;
 class SpeechEvent;
 class SpeechTrackListener;

@ -62,8 +65,6 @@ class SpeechRecognition final : public DOMEventTargetHelper,

  NS_DECL_NSIOBSERVER

-  nsISupports* GetParentObject() const;
-
  JSObject* WrapObject(JSContext* aCx,
                       JS::Handle<JSObject*> aGivenProto) override;

@ -72,6 +73,11 @@ class SpeechRecognition final : public DOMEventTargetHelper,
  static already_AddRefed<SpeechRecognition> Constructor(
      const GlobalObject& aGlobal, ErrorResult& aRv);

+  static already_AddRefed<SpeechRecognition> WebkitSpeechRecognition(
+      const GlobalObject& aGlobal, ErrorResult& aRv) {
+    return Constructor(aGlobal, aRv);
+  }
+
  already_AddRefed<SpeechGrammarList> Grammars() const;

  void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);
@ -90,6 +96,8 @@ class SpeechRecognition final : public DOMEventTargetHelper,

  uint32_t MaxAlternatives() const;

+  TaskQueue* GetTaskQueueForEncoding() const;
+
  void SetMaxAlternatives(uint32_t aArg);

  void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;
@ -153,6 +161,7 @@ class SpeechRecognition final : public DOMEventTargetHelper,
    STATE_WAITING_FOR_SPEECH,
    STATE_RECOGNIZING,
    STATE_WAITING_FOR_RESULT,
+    STATE_ABORTING,
    STATE_COUNT
  };

@ -163,7 +172,7 @@ class SpeechRecognition final : public DOMEventTargetHelper,
  bool ValidateAndSetGrammarList(ErrorResult& aRv);

  NS_IMETHOD StartRecording(RefPtr<AudioStreamTrack>& aDOMStream);
-  NS_IMETHOD StopRecording();
+  RefPtr<GenericNonExclusivePromise> StopRecording();

  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
  void NotifyError(SpeechEvent* aEvent);
@ -186,9 +195,19 @@ class SpeechRecognition final : public DOMEventTargetHelper,

  RefPtr<DOMMediaStream> mStream;
  RefPtr<AudioStreamTrack> mTrack;
+  bool mTrackIsOwned = false;
+  RefPtr<GenericNonExclusivePromise> mStopRecordingPromise;
  RefPtr<SpeechTrackListener> mSpeechListener;
-  RefPtr<SpeechRecognitionShutdownBlocker> mShutdownBlocker;
  nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
+  RefPtr<media::ShutdownBlocker> mShutdownBlocker;
+  // TaskQueue on which the service pre-processes the audio samples, so that
+  // this work runs off the main thread
+  RefPtr<TaskQueue> mEncodeTaskQueue;
+
+  // A generation ID of the MediaStream a started session is for, so that
+  // a gUM request that resolves after the session has stopped, and a new
+  // one has started, can exit early. Main thread only. Can wrap.
+  uint8_t mStreamGeneration = 0;

  FSMState mCurrentState;

@ -197,6 +216,10 @@ class SpeechRecognition final : public DOMEventTargetHelper,

  uint32_t mAudioSamplesPerChunk;

+  // maximum time, in milliseconds, the engine will wait for voice
+  // until returning a 'no speech detected' error
+  uint32_t mSpeechDetectionTimeoutMs;
+
  // buffer holds one chunk of mAudioSamplesPerChunk
  // samples before feeding it to mEndpointer
  RefPtr<SharedBuffer> mAudioSamplesBuffer;
@ -209,6 +232,10 @@ class SpeechRecognition final : public DOMEventTargetHelper,

  RefPtr<SpeechGrammarList> mSpeechGrammarList;

+  // private flag holding the value last passed to SetContinuous(), i.e. the
+  // `continuous` attribute of the API
+  bool mContinuous;
+
  // WebSpeechAPI (http://bit.ly/1gIl7DC) states:
  //
  // 1. Default value MUST be false
--- a/dom/media/webspeech/recognition/moz.build
+++ b/dom/media/webspeech/recognition/moz.build
@ -12,6 +12,7 @@ XPIDL_SOURCES = [
 ]

 EXPORTS.mozilla.dom += [
+    'OnlineSpeechRecognitionService.h',
    'SpeechGrammar.h',
    'SpeechGrammarList.h',
    'SpeechRecognition.h',
@ -21,6 +22,12 @@ EXPORTS.mozilla.dom += [
    'SpeechTrackListener.h',
 ]

+EXPORTS += [
+    'endpointer.h',
+    'energy_endpointer.h',
+    'energy_endpointer_params.h',
+]
+
 if CONFIG['MOZ_WEBSPEECH_TEST_BACKEND']:
    EXPORTS.mozilla.dom += [
        'test/FakeSpeechRecognitionService.h',
@ -30,6 +37,7 @@ UNIFIED_SOURCES += [
    'endpointer.cc',
    'energy_endpointer.cc',
    'energy_endpointer_params.cc',
+    'OnlineSpeechRecognitionService.cpp',
    'SpeechGrammar.cpp',
    'SpeechGrammarList.cpp',
    'SpeechRecognition.cpp',
@ -44,8 +52,13 @@ if CONFIG['MOZ_WEBSPEECH_TEST_BACKEND']:
        'test/FakeSpeechRecognitionService.cpp',
    ]

+USE_LIBS += [
+    'jsoncpp',
+]
+
 LOCAL_INCLUDES += [
    '/dom/base',
+    '/toolkit/components/jsoncpp/include',
 ]

 include('/ipc/chromium/chromium-config.mozbuild')
--- a/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp
+++ b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp
@ -30,6 +30,7 @@ FakeSpeechRecognitionService::~FakeSpeechRecognitionService() = default;
 NS_IMETHODIMP
 FakeSpeechRecognitionService::Initialize(
    WeakPtr<SpeechRecognition> aSpeechRecognition) {
+  MOZ_ASSERT(NS_IsMainThread());
  mRecognition = aSpeechRecognition;
  nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
  obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
@ -40,11 +41,15 @@ FakeSpeechRecognitionService::Initialize(
 NS_IMETHODIMP
 FakeSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment,
                                                  int32_t aSampleRate) {
+  MOZ_ASSERT(!NS_IsMainThread());
  return NS_OK;
 }

 NS_IMETHODIMP
-FakeSpeechRecognitionService::SoundEnd() { return NS_OK; }
+FakeSpeechRecognitionService::SoundEnd() {
+  MOZ_ASSERT(NS_IsMainThread());
+  return NS_OK;
+}

 NS_IMETHODIMP
 FakeSpeechRecognitionService::ValidateAndSetGrammarList(
@ -53,7 +58,10 @@ FakeSpeechRecognitionService::ValidateAndSetGrammarList(
 }

 NS_IMETHODIMP
-FakeSpeechRecognitionService::Abort() { return NS_OK; }
+FakeSpeechRecognitionService::Abort() {
+  MOZ_ASSERT(NS_IsMainThread());
+  return NS_OK;
+}

 NS_IMETHODIMP
 FakeSpeechRecognitionService::Observe(nsISupports* aSubject, const char* aTopic,
@ -85,7 +93,6 @@ FakeSpeechRecognitionService::Observe(nsISupports* aSubject, const char* aTopic,
    event->mRecognitionResultList = BuildMockResultList();
    NS_DispatchToMainThread(event);
  }
-
  return NS_OK;
 }

--- a/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h
+++ b/dom/media/webspeech/recognition/test/FakeSpeechRecognitionService.h
@ -22,7 +22,7 @@ namespace mozilla {
 class FakeSpeechRecognitionService : public nsISpeechRecognitionService,
                                     public nsIObserver {
 public:
-  NS_DECL_ISUPPORTS
+  NS_DECL_THREADSAFE_ISUPPORTS
  NS_DECL_NSISPEECHRECOGNITIONSERVICE
  NS_DECL_NSIOBSERVER

--- a/dom/media/webspeech/recognition/test/head.js
+++ b/dom/media/webspeech/recognition/test/head.js
@ -163,7 +163,16 @@ function performTest(options) {
  );

  SpecialPowers.pushPrefEnv({ set: prefs }, function() {
-    var sr = new SpeechRecognition();
+    var sr;
+    if (!options.webkit) {
+      sr = new SpeechRecognition();
+    } else {
+      sr = new webkitSpeechRecognition();
+      var grammar = new webkitSpeechGrammar();
+      var speechrecognitionlist = new webkitSpeechGrammarList();
+      speechrecognitionlist.addFromString("", 1);
+      sr.grammars = speechrecognitionlist;
+    }
    var em = new EventManager(sr);

    for (var eventName in options.expectedEvents) {
--- a/dom/media/webspeech/recognition/test/http_requesthandler.sjs
+++ b/dom/media/webspeech/recognition/test/http_requesthandler.sjs
@ -0,0 +1,92 @
+const CC = Components.Constructor;
+
+// Context structure - we need to set this up properly to pass to setObjectState
+const ctx = {
+  QueryInterface: function(iid) {
+    if (iid.equals(Components.interfaces.nsISupports))
+      return this;
+    throw Components.results.NS_ERROR_NO_INTERFACE;
+  }
+};
+
+// NOTE(review): setRequest()/getRequest() are not called anywhere in this
+// file. `key` is not defined here and getObjectState() is called below with
+// (key, callback), so both helpers look stale -- confirm before relying on
+// them.
+function setRequest(request) {
+  setObjectState(key, request);
+}
+function getRequest() {
+  let request;
+  getObjectState(v => { request = v });
+  return request;
+}
+
+// Request handler for the online speech recognition tests. The response is
+// chosen by the request's query string:
+//   "save"               replies with the body of the request previously
+//                        stored under the "context" state key
+//   "malformedresult=1"  200 with a truncated JSON body
+//   "emptyresult=1"      200 with a JSON body whose "data" array is empty
+//   "hangup=1"           finishes the response without writing anything
+//   "return400=1"        400 with an error message body
+//   anything else        stores the request under the "context" state key and
+//                        replies 200 with a single "hello" transcription
+function handleRequest(request, response) {
+  response.processAsync();
+  if (request.queryString == "save") {
+    // Get the context structure and finish the old request
+    getObjectState("context", function(obj) {
+      const savedCtx = obj.wrappedJSObject;
+      // From here on `request` is the stored request, not the "save" one.
+      request = savedCtx.request;
+
+      response.setHeader("Content-Type", "application/octet-stream", false);
+      response.setHeader("Access-Control-Allow-Origin", "*", false);
+      response.setHeader("Cache-Control", "no-cache", false);
+      response.setStatusLine(request.httpVersion, 200, "OK");
+
+      const input = request.bodyInputStream;
+      const output = response.bodyOutputStream;
+      let bodyAvail;
+      while ((bodyAvail = input.available()) > 0) {
+        output.writeFrom(input, bodyAvail);
+      }
+      response.finish();
+    });
+    return;
+  } else if (request.queryString == "malformedresult=1" || request.queryString == "emptyresult=1") {
+    const jsonOK = request.queryString == "malformedresult=1" ? '{"status":"ok","dat' : '{"status":"ok","data":[]}';
+    response.setHeader("Content-Length", String(jsonOK.length), false);
+    response.setHeader("Content-Type", "application/json", false);
+    response.setHeader("Access-Control-Allow-Origin", "*", false);
+    response.setHeader("Cache-Control", "no-cache", false);
+    response.setStatusLine(request.httpVersion, 200, "OK");
+    response.write(jsonOK, jsonOK.length);
+    response.finish();
+  } else if (request.queryString == "hangup=1") {
+    response.finish();
+  } else if (request.queryString == "return400=1") {
+    const jsonOK = "{'message':'Bad header:accept-language-stt'}";
+    response.setHeader("Content-Length", String(jsonOK.length), false);
+    response.setHeader("Content-Type", "application/json", false);
+    response.setHeader("Access-Control-Allow-Origin", "*", false);
+    response.setHeader("Cache-Control", "no-cache", false);
+    response.setStatusLine(request.httpVersion, 400, "Bad Request");
+    response.write(jsonOK, jsonOK.length);
+    response.finish();
+  }
+  else {
+    ctx.wrappedJSObject = ctx;
+    ctx.request = request;
+    setObjectState("context", ctx);
+    const jsonOK = '{"status":"ok","data":[{"confidence":0.9085610,"text":"hello"}]}';
+    response.setHeader("Content-Length", String(jsonOK.length), false);
+    response.setHeader("Content-Type", "application/json", false);
+    response.setHeader("Access-Control-Allow-Origin", "*", false);
+    response.setHeader("Cache-Control", "no-cache", false);
+    response.setStatusLine(request.httpVersion, 200, "OK");
+    response.write(jsonOK, jsonOK.length);
+    response.finish();
+  }
+}
--- a/dom/media/webspeech/recognition/test/mochitest.ini
+++ b/dom/media/webspeech/recognition/test/mochitest.ini
@ -5,6 +5,9 @@ support-files =
  head.js
  hello.ogg
  hello.ogg^headers^
+  http_requesthandler.sjs
+  sinoid+hello.ogg
+  sinoid+hello.ogg^headers^
  silence.ogg
  silence.ogg^headers^
 [test_abort.html]
@ -16,6 +19,12 @@ tags=capturestream
 skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538363
 [test_nested_eventloop.html]
 skip-if = toolkit == 'android'
+[test_online_400_response.html]
+[test_online_hangup.html]
+[test_online_http.html]
+[test_online_http_webkit.html]
+[test_online_malformed_result_handling.html]
+[test_online_empty_result_handling.html]
 [test_preference_enable.html]
 [test_recognition_service_error.html]
 skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538360
--- a/dom/media/webspeech/recognition/test/sinoid+hello.ogg
+++ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg
--- a/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^
+++ b/dom/media/webspeech/recognition/test/sinoid+hello.ogg^headers^
@ -0,0 +1 @@
+Cache-Control: no-store
--- a/dom/media/webspeech/recognition/test/test_abort.html
+++ b/dom/media/webspeech/recognition/test/test_abort.html
@ -60,7 +60,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
      eventsToRequest: [],
      expectedEvents,
      doneFunc: (nextEventIdx < eventsToAbortOn.length) ? doNextTest : SimpleTest.finish,
-      prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
+      prefs: [["media.webspeech.test.fake_fsm_events", true],
+              ["media.webspeech.test.fake_recognition_service", true],
+              ["media.webspeech.recognition.timeout", 100000]]
    });
  }

--- a/dom/media/webspeech/recognition/test/test_audio_capture_error.html
+++ b/dom/media/webspeech/recognition/test/test_audio_capture_error.html
@ -32,7 +32,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
      'end': null
    },
    doneFunc: SimpleTest.finish,
-    prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
+    prefs: [["media.webspeech.test.fake_fsm_events", true],
+            ["media.webspeech.test.fake_recognition_service", true],
+            ["media.webspeech.recognition.timeout", 100000]]
  });
 </script>
 </pre>
--- a/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html
+++ b/dom/media/webspeech/recognition/test/test_call_start_from_end_handler.html
@ -91,7 +91,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
      'result': buildResultCallback("Mock final result"),
      'end': endHandler,
    },
-    prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
+    prefs: [["media.webspeech.test.fake_fsm_events", true],
+            ["media.webspeech.test.fake_recognition_service", true],
+            ["media.webspeech.recognition.timeout", 100000]]
  });

 </script>
--- a/dom/media/webspeech/recognition/test/test_nested_eventloop.html
+++ b/dom/media/webspeech/recognition/test/test_nested_eventloop.html
@ -72,7 +72,8 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
    },
    doneFunc,
    prefs: [["media.webspeech.test.fake_fsm_events", true],
-            ["media.webspeech.test.fake_recognition_service", true]]
+            ["media.webspeech.test.fake_recognition_service", true],
+            ["media.webspeech.recognition.timeout", 100000]]
  });

 </script>
--- a/dom/media/webspeech/recognition/test/test_online_400_response.html
+++ b/dom/media/webspeech/recognition/test/test_online_400_response.html
@ -0,0 +1,47 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test the speech recognition service behavior
+whenever the server returns a 400 error
+-->
+<head>
+  <meta charset="utf-8">
+  <title>Test for Bug 1248897 -- Online speech service</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+  <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+  SimpleTest.waitForExplicitFinish();
+
+  performTest({
+    eventsToRequest: [],
+    expectedEvents: {
+      "start": null,
+      "audiostart": null,
+      "audioend": null,
+      "end": null,
+      'error': buildErrorCallback(errorCodes.NETWORK),
+      "speechstart": null,
+      "speechend": null
+    },
+    doneFunc: SimpleTest.finish,
+    prefs: [["media.webspeech.recognition.enable", true],
+            ["media.webspeech.recognition.force_enable", true],
+            ["media.webspeech.service.endpoint",
+              "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?return400=1"],
+            ["media.webspeech.recognition.timeout", 100000]]
+  });
+
+</script>
+</pre>
+</body>
+</html>
--- a/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html
+++ b/dom/media/webspeech/recognition/test/test_online_empty_result_handling.html
@ -0,0 +1,48 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test the speech recognition service behavior
+whenever the server returns a valid json object, but without any transcription
+results on it, for example: `{"status":"ok","data":[]}`
+-->
+<head>
+  <meta charset="utf-8">
+  <title>Test for Bug 1248897 -- Online speech service</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+  <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+  SimpleTest.waitForExplicitFinish();
+
+  performTest({
+    eventsToRequest: [],
+    expectedEvents: {
+      "start": null,
+      "audiostart": null,
+      "audioend": null,
+      "end": null,
+      'error': buildErrorCallback(errorCodes.NETWORK),
+      "speechstart": null,
+      "speechend": null
+    },
+    doneFunc: SimpleTest.finish,
+    prefs: [["media.webspeech.recognition.enable", true],
+            ["media.webspeech.recognition.force_enable", true],
+            ["media.webspeech.service.endpoint",
+              "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?emptyresult=1"],
+            ["media.webspeech.recognition.timeout", 100000]]
+  });
+
+</script>
+</pre>
+</body>
+</html>
--- a/dom/media/webspeech/recognition/test/test_online_hangup.html
+++ b/dom/media/webspeech/recognition/test/test_online_hangup.html
@ -0,0 +1,47 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test the speech recognition service behavior
+whenever the server hangs up the connection without sending any response
+-->
+<head>
+  <meta charset="utf-8">
+  <title>Test for Bug 1248897 -- Online speech service</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+  <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+  SimpleTest.waitForExplicitFinish();  // async test; doneFunc below is SimpleTest.finish
+
+  performTest({  // NOTE(review): performTest/buildErrorCallback/errorCodes are not defined in this page; presumably head.js
+    eventsToRequest: [],
+    expectedEvents: {
+      "start": null,
+      "audiostart": null,
+      "audioend": null,
+      "end": null,
+      'error': buildErrorCallback(errorCodes.NETWORK),  // only entry with a callback; built from the NETWORK code
+      "speechstart": null,
+      "speechend": null
+    },
+    doneFunc: SimpleTest.finish,
+    prefs: [["media.webspeech.recognition.enable", true],
+            ["media.webspeech.recognition.force_enable", true],
+            ["media.webspeech.service.endpoint",  // local .sjs handler, queried with ?hangup=1
+              "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?hangup=1"],
+            ["media.webspeech.recognition.timeout", 100000]]
+  });
+
+</script>
+</pre>
+</body>
+</html>
--- a/dom/media/webspeech/recognition/test/test_online_http.html
+++ b/dom/media/webspeech/recognition/test/test_online_http.html
@ -0,0 +1,89 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test a successful speech recognition request and
+that audio is being properly encoded
+-->
+<head>
+  <meta charset="utf-8">
+  <title>Test for Bug 1248897 -- Online speech service</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+  <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+  SimpleTest.waitForExplicitFinish();
+
+  async function validateRawAudio(buffer) {
+    const ac = new AudioContext();
+    const decodedData = await ac.decodeAudioData(buffer);
+    const source = ac.createBufferSource();
+    source.buffer = decodedData;
+    source.loop = true;
+    const analyser = ac.createAnalyser();
+    analyser.smoothingTimeConstant = 0.2;
+    analyser.fftSize = 1024;
+    source.connect(analyser);
+    const binIndexForFrequency = frequency =>
+      1 + Math.round(frequency * analyser.fftSize / ac.sampleRate);
+    source.start();
+    const data = new Uint8Array(analyser.frequencyBinCount);
+    const start = performance.now();
+    while (true) {
+      if (performance.now() - start > 10000) {
+        return false;
+        break;
+      }
+      analyser.getByteFrequencyData(data);
+      if (data[binIndexForFrequency(200)] < 50 &&
+          data[binIndexForFrequency(440)] > 180 &&
+          data[binIndexForFrequency(1000)] < 50) {
+        return true;
+        break;
+      }
+      await new Promise(r => requestAnimationFrame(r));
+    }
+  }
+
+  async function verifyEncodedAudio(requestUrl) {
+    try {
+      const response = await fetch(requestUrl);
+      const buffer = await response.arrayBuffer();
+      ok(await validateRawAudio(buffer), "Audio encoding is valid");
+    } catch(e) {
+      ok(false, e);
+    } finally {
+      SimpleTest.finish();
+    }
+  }
+
+  performTest({
+    eventsToRequest: {},
+    expectedEvents: {
+      "start": null,
+      "audiostart": null,
+      "audioend": null,
+      "end": null,
+      "result": () => verifyEncodedAudio("http_requesthandler.sjs?save"),
+      "speechstart": null,
+      "speechend": null
+    },
+    audioSampleFile: "sinoid+hello.ogg",
+    prefs: [["media.webspeech.recognition.enable", true],
+            ["media.webspeech.recognition.force_enable", true],
+            ["media.webspeech.service.endpoint",
+              "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs"],
+            ["media.webspeech.recognition.timeout", 100000]]
+  });
+</script>
+</pre>
+</body>
+</html>
--- a/dom/media/webspeech/recognition/test/test_online_http_webkit.html
+++ b/dom/media/webspeech/recognition/test/test_online_http_webkit.html
@ -0,0 +1,90 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test a successful speech recognition request and
+that audio is being properly encoded
+-->
+<head>
+  <meta charset="utf-8">
+  <title>Test for Bug 1248897 -- Online speech service</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+  <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+  SimpleTest.waitForExplicitFinish();
+
+  async function validateRawAudio(buffer) {
+    const ac = new AudioContext();
+    const decodedData = await ac.decodeAudioData(buffer);
+    const source = ac.createBufferSource();
+    source.buffer = decodedData;
+    source.loop = true;
+    const analyser = ac.createAnalyser();
+    analyser.smoothingTimeConstant = 0.2;
+    analyser.fftSize = 1024;
+    source.connect(analyser);
+    const binIndexForFrequency = frequency =>
+      1 + Math.round(frequency * analyser.fftSize / ac.sampleRate);
+    source.start();
+    const data = new Uint8Array(analyser.frequencyBinCount);
+    const start = performance.now();
+    while (true) {
+      if (performance.now() - start > 10000) {
+        return false;
+        break;
+      }
+      analyser.getByteFrequencyData(data);
+      if (data[binIndexForFrequency(200)] < 50 &&
+          data[binIndexForFrequency(440)] > 180 &&
+          data[binIndexForFrequency(1000)] < 50) {
+        return true;
+        break;
+      }
+      await new Promise(r => requestAnimationFrame(r));
+    }
+  }
+
+  async function verifyEncodedAudio(requestUrl) {
+    try {
+      const response = await fetch(requestUrl);
+      const buffer = await response.arrayBuffer();
+      ok(await validateRawAudio(buffer), "Audio encoding is valid");
+    } catch(e) {
+      ok(false, e);
+    } finally {
+      SimpleTest.finish();
+    }
+  }
+
+  performTest({
+    eventsToRequest: {},
+    expectedEvents: {
+      "start": null,
+      "audiostart": null,
+      "audioend": null,
+      "end": null,
+      "result": () => verifyEncodedAudio("http_requesthandler.sjs?save"),
+      "speechstart": null,
+      "speechend": null
+    },
+    audioSampleFile: "sinoid+hello.ogg",
+    prefs: [["media.webspeech.recognition.enable", true],
+            ["media.webspeech.recognition.force_enable", true],
+            ["media.webspeech.service.endpoint",
+              "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs"],
+            ["media.webspeech.recognition.timeout", 100000]],
+    webkit: true
+  });
+</script>
+</pre>
+</body>
+</html>
--- a/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html
+++ b/dom/media/webspeech/recognition/test/test_online_malformed_result_handling.html
@ -0,0 +1,48 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
+The intent of this file is to test the speech recognition service behavior
+whenever the server returns an invalid/corrupted json object, for example:
+`{"status":"ok","dat`
+-->
+<head>
+  <meta charset="utf-8">
+  <title>Test for Bug 1248897 -- Online speech service</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+  <script type="application/javascript" src="head.js"></script>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+
+</div>
+<pre id="test">
+<script type="text/javascript">
+  SimpleTest.waitForExplicitFinish();  // async test; doneFunc below is SimpleTest.finish
+
+  performTest({  // NOTE(review): performTest/buildErrorCallback/errorCodes are not defined in this page; presumably head.js
+    eventsToRequest: [],
+    expectedEvents: {
+      "start": null,
+      "audiostart": null,
+      "audioend": null,
+      "end": null,
+      'error': buildErrorCallback(errorCodes.NETWORK),  // only entry with a callback; built from the NETWORK code
+      "speechstart": null,
+      "speechend": null
+    },
+    doneFunc: SimpleTest.finish,
+    prefs: [["media.webspeech.recognition.enable", true],
+            ["media.webspeech.recognition.force_enable", true],
+            ["media.webspeech.service.endpoint",  // local .sjs handler, queried with ?malformedresult=1
+              "http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?malformedresult=1"],
+            ["media.webspeech.recognition.timeout", 100000]]
+  });
+
+</script>
+</pre>
+</body>
+</html>
--- a/dom/media/webspeech/recognition/test/test_recognition_service_error.html
+++ b/dom/media/webspeech/recognition/test/test_recognition_service_error.html
@ -34,7 +34,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
      'end': null
    },
    doneFunc: SimpleTest.finish,
-    prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
+    prefs: [["media.webspeech.test.fake_fsm_events", true],
+            ["media.webspeech.test.fake_recognition_service", true],
+            ["media.webspeech.recognition.timeout", 100000]]
  });

 </script>
--- a/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html
+++ b/dom/media/webspeech/recognition/test/test_success_without_recognition_service.html
@ -34,7 +34,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
      'end': null
    },
    doneFunc:SimpleTest.finish,
-    prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
+    prefs: [["media.webspeech.test.fake_fsm_events", true],
+            ["media.webspeech.test.fake_recognition_service", true],
+            ["media.webspeech.recognition.timeout", 100000]]
  });

 </script>
--- a/dom/media/webspeech/recognition/test/test_timeout.html
+++ b/dom/media/webspeech/recognition/test/test_timeout.html
@ -31,7 +31,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
    },
    doneFunc: SimpleTest.finish,
    audioSampleFile: "silence.ogg",
-    prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
+    prefs: [["media.webspeech.test.fake_fsm_events", true],
+            ["media.webspeech.test.fake_recognition_service", true],
+            ["media.webspeech.recognition.timeout", 1000]]
  });

 </script>
--- a/dom/webidl/SpeechGrammar.webidl
+++ b/dom/webidl/SpeechGrammar.webidl
@ -11,6 +11,7 @@
 */

 [Pref="media.webspeech.recognition.enable",
+ NamedConstructor=webkitSpeechGrammar,
 Func="SpeechRecognition::IsAuthorized",
 Exposed=Window]
 interface SpeechGrammar {
--- a/dom/webidl/SpeechGrammarList.webidl
+++ b/dom/webidl/SpeechGrammarList.webidl
@ -11,6 +11,7 @@
 */

 [Pref="media.webspeech.recognition.enable",
+ NamedConstructor=webkitSpeechGrammarList,
 Func="SpeechRecognition::IsAuthorized",
 Exposed=Window]
 interface SpeechGrammarList {
--- a/dom/webidl/SpeechRecognition.webidl
+++ b/dom/webidl/SpeechRecognition.webidl
@ -11,6 +11,7 @@
 */

 [Pref="media.webspeech.recognition.enable",
+ NamedConstructor=webkitSpeechRecognition,
 Func="SpeechRecognition::IsAuthorized",
 Exposed=Window]
 interface SpeechRecognition : EventTarget {
--- a/layout/build/components.conf
+++ b/layout/build/components.conf
@ -438,6 +438,12 @@ if defined('MOZ_WEBSPEECH'):
            'headers': ['mozilla/dom/nsSynthVoiceRegistry.h'],
            'constructor': 'mozilla::dom::nsSynthVoiceRegistry::GetInstanceForService',
        },
+        {
+            'cid': '{0ff5ce56-5b09-4db8-adc6-8266af95f864}',
+            'contract_ids': ['@mozilla.org/webspeech/service;1?name=online'],
+            'type': 'mozilla::OnlineSpeechRecognitionService',
+            'headers': ['mozilla/dom/OnlineSpeechRecognitionService.h'],
+        },
    ]

 if defined('MOZ_WEBSPEECH_TEST_BACKEND'):
--- a/layout/build/nsLayoutModule.cpp
+++ b/layout/build/nsLayoutModule.cpp
@ -47,6 +47,7 @@

 #ifdef MOZ_WEBSPEECH
 #  include "mozilla/dom/nsSynthVoiceRegistry.h"
+#  include "mozilla/dom/OnlineSpeechRecognitionService.h"
 #endif

 #include "mozilla/dom/PushNotifier.h"