Bug 1248897 - Introducing an online speech recognition service for Web Speech API r=smaug,pehrsons,padenot

This patch introduces a speech recognition service that interfaces with Mozilla's remote STT endpoint, which is currently used by multiple services.
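A minimal sketch of the page-level flow this service backs (assuming the recognition prefs are enabled and microphone permission is granted):

  const recognition = new webkitSpeechRecognition();
  recognition.lang = "en-US";
  recognition.onresult = (event) => {
    const { transcript, confidence } = event.results[0][0];
    console.log(transcript, confidence);
  };
  recognition.onerror = (event) => console.error(event.error);
  // start() captures mic audio; the service encodes it to Ogg/Opus and
  // POSTs it to the remote endpoint.
  recognition.start();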

Differential Revision: https://phabricator.services.mozilla.com/D26047

--HG--
extra : moz-landing-system : lando
Andre Natal 2019-10-21 20:58:57 +00:00
parent 1fd2626e6a
commit 20834f4fb9
32 changed files with 1412 additions and 142 deletions

View File

@ -0,0 +1,473 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsThreadUtils.h"
#include "nsXPCOMCIDInternal.h"
#include "OnlineSpeechRecognitionService.h"
#include "nsIFile.h"
#include "SpeechGrammar.h"
#include "SpeechRecognition.h"
#include "SpeechRecognitionAlternative.h"
#include "SpeechRecognitionResult.h"
#include "SpeechRecognitionResultList.h"
#include "nsIObserverService.h"
#include "mozilla/StaticPrefs_media.h"
#include "mozilla/Services.h"
#include "nsDirectoryServiceDefs.h"
#include "nsDirectoryServiceUtils.h"
#include "nsMemory.h"
#include "nsNetUtil.h"
#include "nsContentUtils.h"
#include "nsIPrincipal.h"
#include "nsIStreamListener.h"
#include "nsIUploadChannel2.h"
#include "mozilla/dom/ClientIPCTypes.h"
#include "nsStringStream.h"
#include "nsIOutputStream.h"
#include "nsStreamUtils.h"
#include "OpusTrackEncoder.h"
#include "OggWriter.h"
#include "nsIClassOfService.h"
#include <json/json.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace mozilla {
using namespace dom;
using namespace std;
#define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \
"media.webspeech.service.endpoint"
#define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/"
#define MAX_LISTENING_TIME_MS 10000
NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService,
nsIStreamListener)
NS_IMETHODIMP
OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) {
MOZ_ASSERT(NS_IsMainThread());
return NS_OK;
}
static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure,
const char* aFromRawSegment,
uint32_t aToOffset, uint32_t aCount,
uint32_t* aWriteCount) {
nsCString* buf = static_cast<nsCString*>(aClosure);
buf->Append(aFromRawSegment, aCount);
*aWriteCount = aCount;
return NS_OK;
}
NS_IMETHODIMP
OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest,
nsIInputStream* aInputStream,
uint64_t aOffset,
uint32_t aCount) {
MOZ_ASSERT(NS_IsMainThread());
nsresult rv;
uint32_t readCount;
rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf, aCount,
&readCount);
NS_ENSURE_SUCCESS(rv, rv);
return NS_OK;
}
NS_IMETHODIMP
OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest,
nsresult aStatusCode) {
MOZ_ASSERT(NS_IsMainThread());
auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); });
if (mAborted) {
return NS_OK;
}
bool success;
float confidence = 0;
Json::Value root;
Json::CharReaderBuilder builder;
bool parsingSuccessful;
nsAutoCString result;
nsAutoCString hypoValue;
nsAutoString errorMsg;
SpeechRecognitionErrorCode errorCode;
SR_LOG("STT Result: %s", mBuf.get());
if (NS_FAILED(aStatusCode)) {
success = false;
errorMsg.Assign(NS_LITERAL_STRING("Error connecting to the service."));
errorCode = SpeechRecognitionErrorCode::Network;
} else {
success = true;
UniquePtr<Json::CharReader> const reader(builder.newCharReader());
parsingSuccessful =
reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, nullptr);
if (!parsingSuccessful) {
// The response wasn't valid JSON; treat it as an internal server error
success = false;
errorMsg.Assign(NS_LITERAL_STRING("Internal server error"));
errorCode = SpeechRecognitionErrorCode::Network;
} else {
result.Assign(root.get("status", "error").asString().c_str());
if (result.EqualsLiteral("ok")) {
// ok, we have a result
if (!root["data"].empty()) {
hypoValue.Assign(root["data"][0].get("text", "").asString().c_str());
confidence = root["data"][0].get("confidence", "0").asFloat();
} else {
success = false;
errorMsg.Assign(NS_LITERAL_STRING("Error reading result data."));
errorCode = SpeechRecognitionErrorCode::Network;
}
} else {
success = false;
NS_ConvertUTF8toUTF16 error(root.get("message", "").asString().c_str());
errorMsg.Assign(error);
errorCode = SpeechRecognitionErrorCode::No_speech;
}
}
}
if (!success) {
mRecognition->DispatchError(
SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg);
} else {
// Build the JavaScript result events to dispatch to the page
RefPtr<SpeechEvent> event = new SpeechEvent(
mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT);
SpeechRecognitionResultList* resultList =
new SpeechRecognitionResultList(mRecognition);
SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition);
if (mRecognition->MaxAlternatives() > 0) {
SpeechRecognitionAlternative* alternative =
new SpeechRecognitionAlternative(mRecognition);
alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue);
alternative->mConfidence = confidence;
result->mItems.AppendElement(alternative);
}
resultList->mItems.AppendElement(result);
event->mRecognitionResultList = resultList;
NS_DispatchToMainThread(event);
}
return NS_OK;
}
OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default;
OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default;
NS_IMETHODIMP
OnlineSpeechRecognitionService::Initialize(
WeakPtr<SpeechRecognition> aSpeechRecognition) {
MOZ_ASSERT(NS_IsMainThread());
mWriter = MakeUnique<OggWriter>();
mRecognition = new nsMainThreadPtrHolder<SpeechRecognition>(
"OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition);
mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding();
MOZ_ASSERT(mEncodeTaskQueue);
return NS_OK;
}
void OnlineSpeechRecognitionService::EncoderDataAvailable() {
MOZ_ASSERT(!NS_IsMainThread());
nsresult rv;
AutoTArray<RefPtr<EncodedFrame>, 4> container;
rv = mAudioEncoder->GetEncodedTrack(container);
if (NS_WARN_IF(NS_FAILED(rv))) {
MOZ_ASSERT_UNREACHABLE("Failed to get the encoded track");
}
rv = mWriter->WriteEncodedTrack(
container,
mAudioEncoder->IsEncodingComplete() ? ContainerWriter::END_OF_STREAM : 0);
if (NS_WARN_IF(NS_FAILED(rv))) {
MOZ_ASSERT_UNREACHABLE("Failed to write the encoded track");
}
mWriter->GetContainerData(&mEncodedData, mAudioEncoder->IsEncodingComplete()
? ContainerWriter::FLUSH_NEEDED
: 0);
if (mAudioEncoder->IsEncodingComplete()) {
NS_DispatchToMainThread(
NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this,
&OnlineSpeechRecognitionService::DoSTT));
}
}
void OnlineSpeechRecognitionService::EncoderInitialized() {
MOZ_ASSERT(!NS_IsMainThread());
AutoTArray<RefPtr<TrackMetadataBase>, 1> metadata;
metadata.AppendElement(mAudioEncoder->GetMetadata());
if (metadata[0]->GetKind() != TrackMetadataBase::METADATA_OPUS) {
SR_LOG("wrong meta data type!");
MOZ_ASSERT_UNREACHABLE();
}
nsresult rv = mWriter->SetMetadata(metadata);
MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER);
MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
}
void OnlineSpeechRecognitionService::EncoderError() {
MOZ_ASSERT(!NS_IsMainThread());
SR_LOG("Error encoding frames.");
mEncodedData.Clear();
NS_DispatchToMainThread(NS_NewRunnableFunction(
"SpeechRecognition::DispatchError",
[this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
if (!mRecognition) {
return;
}
mRecognition->DispatchError(
SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
SpeechRecognitionErrorCode::Audio_capture,
NS_LITERAL_STRING("Encoder error"));
}));
}
NS_IMETHODIMP
OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment,
int32_t aSampleRate) {
MOZ_ASSERT(!NS_IsMainThread());
int64_t duration = aAudioSegment->GetDuration();
if (duration <= 0) {
return NS_OK;
}
if (!mAudioEncoder) {
mSpeechEncoderListener = new SpeechEncoderListener(this);
mAudioEncoder = MakeAndAddRef<OpusTrackEncoder>(aSampleRate);
RefPtr<AbstractThread> encoderThread = AbstractThread::GetCurrent();
mAudioEncoder->SetWorkerThread(encoderThread);
mAudioEncoder->RegisterListener(mSpeechEncoderListener);
}
mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment));
TimeStamp now = TimeStamp::Now();
if (mFirstIteration.IsNull()) {
mFirstIteration = now;
}
if ((now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS) {
NS_DispatchToMainThread(NS_NewRunnableFunction(
"SpeechRecognition::Stop",
[this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
if (!mRecognition) {
return;
}
mRecognition->Stop();
}));
return NS_OK;
}
return NS_OK;
}
void OnlineSpeechRecognitionService::DoSTT() {
MOZ_ASSERT(NS_IsMainThread());
if (mAborted) {
return;
}
nsresult rv;
nsCOMPtr<nsIChannel> chan;
nsCOMPtr<nsIURI> uri;
nsAutoCString speechRecognitionEndpoint;
nsAutoCString prefEndpoint;
nsAutoString language;
Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT,
prefEndpoint);
if (!prefEndpoint.IsEmpty()) {
speechRecognitionEndpoint = prefEndpoint;
} else {
speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT;
}
rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr,
nullptr);
if (NS_WARN_IF(NS_FAILED(rv))) {
mRecognition->DispatchError(
SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
SpeechRecognitionErrorCode::Network, NS_LITERAL_STRING("Unknown URI"));
return;
}
nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_DATA_INHERITS;
nsLoadFlags loadFlags =
nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER;
nsContentPolicyType contentPolicy =
nsContentUtils::InternalContentPolicyTypeToExternal(
nsIContentPolicy::TYPE_OTHER);
nsPIDOMWindowInner* window = mRecognition->GetOwner();
if (NS_WARN_IF(!window)) {
mRecognition->DispatchError(
SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
SpeechRecognitionErrorCode::Aborted, NS_LITERAL_STRING("No window"));
return;
}
Document* doc = window->GetExtantDoc();
if (NS_WARN_IF(!doc)) {
mRecognition->DispatchError(
SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
SpeechRecognitionErrorCode::Aborted, NS_LITERAL_STRING("No document"));
return;
}
rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags,
contentPolicy, nullptr, nullptr, nullptr, nullptr,
loadFlags);
if (NS_WARN_IF(NS_FAILED(rv))) {
mRecognition->DispatchError(
SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
SpeechRecognitionErrorCode::Network,
NS_LITERAL_STRING("Failed to open channel"));
return;
}
nsCOMPtr<nsIHttpChannel> httpChan = do_QueryInterface(chan);
if (httpChan) {
rv = httpChan->SetRequestMethod(NS_LITERAL_CSTRING("POST"));
MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
mRecognition->GetLang(language);
// Accept-Language-STT is a custom header of our backend server used to set
// the language of the speech sample being submitted by the client
rv = httpChan->SetRequestHeader(NS_LITERAL_CSTRING("Accept-Language-STT"),
NS_ConvertUTF16toUTF8(language), false);
MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
// Tell the server to not store the transcription by default
rv = httpChan->SetRequestHeader(NS_LITERAL_CSTRING("Store-Transcription"),
NS_LITERAL_CSTRING("0"), false);
MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
// Tell the server to not store the sample by default
rv = httpChan->SetRequestHeader(NS_LITERAL_CSTRING("Store-Sample"),
NS_LITERAL_CSTRING("0"), false);
MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
// Set the product tag identifying the Web Speech API client
rv = httpChan->SetRequestHeader(NS_LITERAL_CSTRING("Product-Tag"),
NS_LITERAL_CSTRING("wsa"), false);
MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
}
nsCOMPtr<nsIClassOfService> cos(do_QueryInterface(chan));
if (cos) {
cos->AddClassFlags(nsIClassOfService::UrgentStart);
}
nsCOMPtr<nsIUploadChannel2> uploadChan = do_QueryInterface(chan);
if (uploadChan) {
nsCOMPtr<nsIInputStream> bodyStream;
uint32_t length = 0;
for (const nsTArray<uint8_t>& chunk : mEncodedData) {
length += chunk.Length();
}
nsTArray<uint8_t> audio;
if (!audio.SetCapacity(length, fallible)) {
mRecognition->DispatchError(
SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
SpeechRecognitionErrorCode::Audio_capture,
NS_LITERAL_STRING("Allocation error"));
return;
}
for (const nsTArray<uint8_t>& chunk : mEncodedData) {
audio.AppendElements(chunk);
}
mEncodedData.Clear();
rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio));
if (NS_WARN_IF(NS_FAILED(rv))) {
mRecognition->DispatchError(
SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
SpeechRecognitionErrorCode::Network,
NS_LITERAL_STRING("Failed to open stream"));
return;
}
if (bodyStream) {
rv = uploadChan->ExplicitSetUploadStream(
bodyStream, NS_LITERAL_CSTRING("audio/ogg"), length,
NS_LITERAL_CSTRING("POST"), false);
MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
}
}
rv = chan->AsyncOpen(this);
if (NS_WARN_IF(NS_FAILED(rv))) {
mRecognition->DispatchError(
SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
SpeechRecognitionErrorCode::Network,
NS_LITERAL_STRING("Internal server error"));
}
}
NS_IMETHODIMP
OnlineSpeechRecognitionService::SoundEnd() {
MOZ_ASSERT(NS_IsMainThread());
if (!mEncodeTaskQueue) {
// Not initialized
return NS_OK;
}
nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction(
"OnlineSpeechRecognitionService::SoundEnd",
[this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
if (mAudioEncoder) {
mAudioEncoder->NotifyEndOfStream();
mAudioEncoder->UnregisterListener(mSpeechEncoderListener);
mSpeechEncoderListener = nullptr;
mAudioEncoder = nullptr;
}
}));
MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
Unused << rv;
mEncodeTaskQueue = nullptr;
return NS_OK;
}
NS_IMETHODIMP
OnlineSpeechRecognitionService::ValidateAndSetGrammarList(
SpeechGrammar* aSpeechGrammar,
nsISpeechGrammarCompilationCallback* aCallback) {
// This is an online LVCSR (STT) service,
// so we don't need to set a grammar
return NS_OK;
}
NS_IMETHODIMP
OnlineSpeechRecognitionService::Abort() {
MOZ_ASSERT(NS_IsMainThread());
if (mAborted) {
return NS_OK;
}
mAborted = true;
return SoundEnd();
}
} // namespace mozilla
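For reference, a rough JavaScript equivalent of the request DoSTT() builds and of the responses OnStopRequest() parses. The endpoint and header names come from the code above; oggOpusBytes is a hypothetical stand-in for the concatenated mEncodedData chunks:

  const response = await fetch("https://speaktome-2.services.mozilla.com/", {
    method: "POST",
    headers: {
      "Accept-Language-STT": "en-US", // language of the submitted sample
      "Store-Transcription": "0",     // ask the server not to keep the text
      "Store-Sample": "0",            // ask the server not to keep the audio
      "Product-Tag": "wsa",           // tags the request as Web Speech API
      "Content-Type": "audio/ogg",
    },
    body: oggOpusBytes,
  });
  // Success: {"status":"ok","data":[{"confidence":0.9085610,"text":"hello"}]}
  // Failure: {"status":"error","message":"..."}, or a body that fails to
  // parse, which the service surfaces as a network error.
  const result = await response.json();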

View File

@ -0,0 +1,133 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef mozilla_dom_OnlineRecognitionService_h
#define mozilla_dom_OnlineRecognitionService_h
#include "nsCOMPtr.h"
#include "nsTArray.h"
#include "nsISpeechRecognitionService.h"
#include "speex/speex_resampler.h"
#include "nsIStreamListener.h"
#include "OpusTrackEncoder.h"
#include "ContainerWriter.h"
#define NS_ONLINE_SPEECH_RECOGNITION_SERVICE_CID \
{0x0ff5ce56, \
0x5b09, \
0x4db8, \
{0xad, 0xc6, 0x82, 0x66, 0xaf, 0x95, 0xf8, 0x64}};
namespace mozilla {
namespace ipc {
class PrincipalInfo;
} // namespace ipc
/**
* Online implementation of the nsISpeechRecognitionService interface
*/
class OnlineSpeechRecognitionService : public nsISpeechRecognitionService,
public nsIStreamListener {
public:
// Add XPCOM glue code
NS_DECL_THREADSAFE_ISUPPORTS
NS_DECL_NSISPEECHRECOGNITIONSERVICE
NS_DECL_NSIREQUESTOBSERVER
NS_DECL_NSISTREAMLISTENER
/**
* Listener responsible for handling the events raised by the TrackEncoder
*/
class SpeechEncoderListener : public TrackEncoderListener {
public:
explicit SpeechEncoderListener(OnlineSpeechRecognitionService* aService)
: mService(aService), mOwningThread(AbstractThread::GetCurrent()) {}
void Initialized(TrackEncoder* aEncoder) override {
MOZ_ASSERT(mOwningThread->IsCurrentThreadIn());
mService->EncoderInitialized();
}
void DataAvailable(TrackEncoder* aEncoder) override {
MOZ_ASSERT(mOwningThread->IsCurrentThreadIn());
mService->EncoderDataAvailable();
}
void Error(TrackEncoder* aEncoder) override {
MOZ_ASSERT(mOwningThread->IsCurrentThreadIn());
mService->EncoderError();
}
private:
const RefPtr<OnlineSpeechRecognitionService> mService;
const RefPtr<AbstractThread> mOwningThread;
};
/**
* Default constructs a OnlineSpeechRecognitionService
*/
OnlineSpeechRecognitionService();
/**
* Called by SpeechEncoderListener when the AudioTrackEncoder has been
* initialized.
*/
void EncoderInitialized();
/**
* Called by SpeechEncoderListener when the AudioTrackEncoder has encoded
* some data for us to pass along.
*/
void EncoderDataAvailable();
/**
* Called by SpeechEncoderListener when the AudioTrackEncoder has
* encountered an error.
*/
void EncoderError();
private:
/**
* Private destructor to prevent bypassing of reference counting
*/
virtual ~OnlineSpeechRecognitionService();
/** The associated SpeechRecognition */
nsMainThreadPtrHandle<dom::SpeechRecognition> mRecognition;
/**
* Builds a mock SpeechRecognitionResultList
*/
dom::SpeechRecognitionResultList* BuildMockResultList();
/**
* Method responsible for uploading the audio to the remote endpoint
*/
void DoSTT();
// Encoded and packaged Ogg audio data
nsTArray<nsTArray<uint8_t>> mEncodedData;
// Member responsible for holding a reference to the TrackEncoderListener
RefPtr<SpeechEncoderListener> mSpeechEncoderListener;
// Encoder responsible for encoding the frames from PCM to Opus, the format
// supported by our backend
RefPtr<AudioTrackEncoder> mAudioEncoder;
// Object responsible for wrapping the Opus frames into an Ogg container
UniquePtr<ContainerWriter> mWriter;
// Member responsible for storing the JSON string returned by the endpoint
nsCString mBuf;
// Used to calculate a ceiling on the time spent listening.
TimeStamp mFirstIteration;
// Whether the user chose to abort the session
bool mAborted = false;
// Reference to the audio encoder task queue
RefPtr<TaskQueue> mEncodeTaskQueue;
};
} // namespace mozilla
#endif

View File

@ -36,6 +36,11 @@ class SpeechGrammar final : public nsISupports, public nsWrapperCache {
static already_AddRefed<SpeechGrammar> Constructor(
const GlobalObject& aGlobal);
static already_AddRefed<SpeechGrammar> WebkitSpeechGrammar(
const GlobalObject& aGlobal, ErrorResult& aRv) {
return Constructor(aGlobal);
}
void GetSrc(nsString& aRetVal, ErrorResult& aRv) const;
void SetSrc(const nsAString& aArg, ErrorResult& aRv);

View File

@ -35,6 +35,11 @@ class SpeechGrammarList final : public nsISupports, public nsWrapperCache {
static already_AddRefed<SpeechGrammarList> Constructor(
const GlobalObject& aGlobal);
static already_AddRefed<SpeechGrammarList> WebkitSpeechGrammarList(
const GlobalObject& aGlobal, ErrorResult& aRv) {
return Constructor(aGlobal);
}
nsISupports* GetParentObject() const;
JSObject* WrapObject(JSContext* aCx,

View File

@ -19,7 +19,8 @@
#include "mozilla/Preferences.h"
#include "mozilla/Services.h"
#include "mozilla/StaticPrefs_media.h"
#include "mozilla/AbstractThread.h"
#include "VideoUtils.h"
#include "AudioSegment.h"
#include "MediaEnginePrefs.h"
#include "endpointer.h"
@ -46,17 +47,17 @@ namespace mozilla {
namespace dom {
#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE_PREFIX "pocketsphinx-"
#define DEFAULT_RECOGNITION_SERVICE "pocketsphinx-en-US"
#define DEFAULT_RECOGNITION_SERVICE "online"
#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH \
"media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH \
"media.webspeech.long_speech_length"
#define PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS \
"media.webspeech.recognition.timeout"
static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;
// number of frames corresponding to 300ms of audio to send to endpointer while
// it's in environment estimation mode
@ -70,19 +71,39 @@ LogModule* GetSpeechRecognitionLog() {
#define SR_LOG(...) \
MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
already_AddRefed<nsISpeechRecognitionService> GetSpeechRecognitionService(
const nsAString& aLang) {
namespace {
class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker {
public:
SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition,
const nsString& aName)
: media::ShutdownBlocker(aName), mRecognition(aRecognition) {}
NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override {
MOZ_ASSERT(NS_IsMainThread());
// AbortSilently will eventually clear the blocker.
mRecognition->Abort();
return NS_OK;
}
private:
const RefPtr<SpeechRecognition> mRecognition;
};
enum class ServiceCreationError {
ServiceNotFound,
};
Result<nsCOMPtr<nsISpeechRecognitionService>, ServiceCreationError>
CreateSpeechRecognitionService(nsPIDOMWindowInner* aWindow,
SpeechRecognition* aRecognition,
const nsAString& aLang) {
nsAutoCString speechRecognitionServiceCID;
nsAutoCString prefValue;
Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE, prefValue);
nsAutoCString speechRecognitionService;
if (!aLang.IsEmpty()) {
speechRecognitionService =
NS_LITERAL_CSTRING(DEFAULT_RECOGNITION_SERVICE_PREFIX) +
NS_ConvertUTF16toUTF8(aLang);
} else if (!prefValue.IsEmpty()) {
if (!prefValue.IsEmpty()) {
speechRecognitionService = prefValue;
} else {
speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
@ -99,27 +120,15 @@ already_AddRefed<nsISpeechRecognitionService> GetSpeechRecognitionService(
nsresult rv;
nsCOMPtr<nsISpeechRecognitionService> recognitionService;
recognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
return recognitionService.forget();
}
class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker {
public:
explicit SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition)
: media::ShutdownBlocker(NS_LITERAL_STRING("SpeechRecognition shutdown")),
mRecognition(aRecognition) {}
NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override {
MOZ_ASSERT(NS_IsMainThread());
// AbortSilently will eventually clear the blocker.
mRecognition->Abort();
return NS_OK;
recognitionService =
do_CreateInstance(speechRecognitionServiceCID.get(), &rv);
if (!recognitionService) {
return Err(ServiceCreationError::ServiceNotFound);
}
private:
const RefPtr<SpeechRecognition> mRecognition;
};
return recognitionService;
}
} // namespace
NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechRecognition, DOMEventTargetHelper,
mStream, mTrack, mRecognitionService,
@ -137,7 +146,8 @@ SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
mEndpointer(kSAMPLE_RATE),
mAudioSamplesPerChunk(mEndpointer.FrameSize()),
mSpeechDetectionTimer(NS_NewTimer()),
mSpeechGrammarList(new SpeechGrammarList(GetParentObject())),
mSpeechGrammarList(new SpeechGrammarList(GetOwner())),
mContinuous(false),
mInterimResults(false),
mMaxAlternatives(1) {
SR_LOG("created SpeechRecognition");
@ -154,6 +164,10 @@ SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000));
mEndpointer.set_long_speech_length(
Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 3 * 1000000));
mSpeechDetectionTimeoutMs =
Preferences::GetInt(PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS, 10000);
Reset();
}
@ -211,8 +225,6 @@ already_AddRefed<SpeechRecognition> SpeechRecognition::Constructor(
return object.forget();
}
nsISupports* SpeechRecognition::GetParentObject() const { return GetOwner(); }
void SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) {
SR_LOG("Processing %s, current state is %s", GetName(aEvent),
GetName(mCurrentState));
@ -245,8 +257,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
case EVENT_RECOGNITIONSERVICE_ERROR:
AbortError(aEvent);
break;
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
default:
MOZ_CRASH("Invalid event");
}
break;
case STATE_STARTING:
@ -262,7 +274,7 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
AbortSilently(aEvent);
break;
case EVENT_STOP:
Reset();
ResetAndEnd();
break;
case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
@ -271,8 +283,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
case EVENT_START:
SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
default:
MOZ_CRASH("Invalid event");
}
break;
case STATE_ESTIMATING:
@ -297,8 +309,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
case EVENT_START:
SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
default:
MOZ_CRASH("Invalid event");
}
break;
case STATE_WAITING_FOR_SPEECH:
@ -323,8 +335,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
case EVENT_START:
SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
default:
MOZ_CRASH("Invalid event");
}
break;
case STATE_RECOGNIZING:
@ -349,8 +361,8 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
case EVENT_START:
SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
default:
MOZ_CRASH("Invalid event");
}
break;
case STATE_WAITING_FOR_RESULT:
@ -376,12 +388,30 @@ void SpeechRecognition::Transition(SpeechEvent* aEvent) {
SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s",
GetName(aEvent));
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
default:
MOZ_CRASH("Invalid event");
}
break;
case STATE_COUNT:
MOZ_CRASH("Invalid state STATE_COUNT");
case STATE_ABORTING:
switch (aEvent->mType) {
case EVENT_STOP:
case EVENT_ABORT:
case EVENT_AUDIO_DATA:
case EVENT_AUDIO_ERROR:
case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
case EVENT_RECOGNITIONSERVICE_ERROR:
DoNothing(aEvent);
break;
case EVENT_START:
SR_LOG("STATE_ABORTING: Unhandled aEvent %s", GetName(aEvent));
MOZ_CRASH();
default:
MOZ_CRASH("Invalid event");
}
break;
default:
MOZ_CRASH("Invalid state");
}
}
@ -400,7 +430,17 @@ uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment,
iterator.Next();
}
mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate);
// We need to call nsISpeechRecognitionService::ProcessAudioSegment on a
// separate thread so that any encoding or pre-processing of the audio
// does not block the main thread.
nsresult rv = mEncodeTaskQueue->Dispatch(
NewRunnableMethod<StoreCopyPassByPtr<AudioSegment>, TrackRate>(
"nsISpeechRecognitionService::ProcessAudioSegment",
mRecognitionService,
&nsISpeechRecognitionService::ProcessAudioSegment,
std::move(*aSegment), aTrackRate));
MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
Unused << rv;
return samples;
}
@ -421,7 +461,19 @@ uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment,
void SpeechRecognition::Reset() {
SetState(STATE_IDLE);
// This breaks potential ref-cycles.
mRecognitionService = nullptr;
++mStreamGeneration;
if (mStream) {
mStream->UnregisterTrackListener(this);
mStream = nullptr;
}
mTrack = nullptr;
mTrackIsOwned = false;
mStopRecordingPromise = nullptr;
mEncodeTaskQueue = nullptr;
mEstimationSamples = 0;
mBufferedSamples = 0;
mSpeechDetectionTimer->Cancel();
@ -454,7 +506,12 @@ void SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent) {
SetState(STATE_WAITING_FOR_RESULT);
MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
mRecognitionService->SoundEnd();
// This will run SoundEnd on the service just before StopRecording begins
// shutting the encode thread down.
mSpeechListener->mRemovedPromise->Then(
GetCurrentThreadSerialEventTarget(), __func__,
[service = mRecognitionService] { service->SoundEnd(); });
StopRecording();
}
@ -518,14 +575,23 @@ void SpeechRecognition::DoNothing(SpeechEvent* aEvent) {}
void SpeechRecognition::AbortSilently(SpeechEvent* aEvent) {
if (mRecognitionService) {
mRecognitionService->Abort();
if (mTrack) {
// This will run Abort on the service just before StopRecording begins
// shutting the encode thread down.
mSpeechListener->mRemovedPromise->Then(
GetCurrentThreadSerialEventTarget(), __func__,
[service = mRecognitionService] { service->Abort(); });
} else {
// Recording hasn't started yet. We can just call Abort().
mRecognitionService->Abort();
}
}
if (mTrack) {
StopRecording();
}
StopRecording()->Then(
GetCurrentThreadSerialEventTarget(), __func__,
[self = RefPtr<SpeechRecognition>(this), this] { ResetAndEnd(); });
ResetAndEnd();
SetState(STATE_ABORTING);
}
void SpeechRecognition::AbortError(SpeechEvent* aEvent) {
@ -544,54 +610,83 @@ void SpeechRecognition::NotifyError(SpeechEvent* aEvent) {
**************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(RefPtr<AudioStreamTrack>& aTrack) {
// hold a reference so that the underlying track
// doesn't get Destroy()'ed
// hold a reference so that the underlying track doesn't get collected.
mTrack = aTrack;
MOZ_ASSERT(!mTrack->Ended());
if (NS_WARN_IF(mTrack->Ended())) {
return NS_ERROR_UNEXPECTED;
}
mSpeechListener = new SpeechTrackListener(this);
mTrack->AddListener(mSpeechListener);
mShutdownBlocker = MakeAndAddRef<SpeechRecognitionShutdownBlocker>(this);
nsString blockerName;
blockerName.AppendPrintf("SpeechRecognition %p shutdown", this);
mShutdownBlocker =
MakeAndAddRef<SpeechRecognitionShutdownBlocker>(this, blockerName);
RefPtr<nsIAsyncShutdownClient> shutdown = media::GetShutdownBarrier();
shutdown->AddBlocker(mShutdownBlocker, NS_LITERAL_STRING(__FILE__), __LINE__,
NS_LITERAL_STRING("SpeechRecognition shutdown"));
mEndpointer.StartSession();
return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
return mSpeechDetectionTimer->Init(this, mSpeechDetectionTimeoutMs,
nsITimer::TYPE_ONE_SHOT);
}
NS_IMETHODIMP
SpeechRecognition::StopRecording() {
if (mShutdownBlocker) {
// Block shutdown until the speech track listener has been removed from the
// MTG, as it holds a reference to us, and we reference the world, which we
// don't want to leak.
mSpeechListener->mRemovedPromise->Then(
GetCurrentThreadSerialEventTarget(), __func__,
[blocker = std::move(mShutdownBlocker)] {
RefPtr<nsIAsyncShutdownClient> shutdown = media::GetShutdownBarrier();
nsresult rv = shutdown->RemoveBlocker(blocker);
MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
Unused << rv;
});
RefPtr<GenericNonExclusivePromise> SpeechRecognition::StopRecording() {
if (!mTrack) {
// Recording wasn't started, or has already been stopped.
if (mStream) {
// Ensure we don't start recording because a track became available
// before we get reset.
mStream->UnregisterTrackListener(this);
}
return GenericNonExclusivePromise::CreateAndResolve(true, __func__);
}
if (mStopRecordingPromise) {
return mStopRecordingPromise;
}
MOZ_ASSERT(!mShutdownBlocker);
mStream->UnregisterTrackListener(this);
mTrack->RemoveListener(mSpeechListener);
mStream = nullptr;
mSpeechListener = nullptr;
mTrack = nullptr;
if (mTrackIsOwned) {
mTrack->Stop();
}
mEndpointer.EndSession();
DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));
return NS_OK;
// Block shutdown until the speech track listener has been removed from the
// MSG, as it holds a reference to us, and we reference the world, which we
// don't want to leak.
mStopRecordingPromise =
mSpeechListener->mRemovedPromise
->Then(
GetCurrentThreadSerialEventTarget(), __func__,
[self = RefPtr<SpeechRecognition>(this), this] {
SR_LOG("Shutting down encoding thread");
return mEncodeTaskQueue->BeginShutdown();
},
[] {
MOZ_CRASH("Unexpected rejection");
return ShutdownPromise::CreateAndResolve(false, __func__);
})
->Then(
GetCurrentThreadSerialEventTarget(), __func__,
[self = RefPtr<SpeechRecognition>(this), this] {
RefPtr<nsIAsyncShutdownClient> shutdown =
media::GetShutdownBarrier();
shutdown->RemoveBlocker(mShutdownBlocker);
mShutdownBlocker = nullptr;
MOZ_DIAGNOSTIC_ASSERT(mCurrentState != STATE_IDLE);
return GenericNonExclusivePromise::CreateAndResolve(true,
__func__);
},
[] {
MOZ_CRASH("Unexpected rejection");
return GenericNonExclusivePromise::CreateAndResolve(false,
__func__);
});
return mStopRecordingPromise;
}
NS_IMETHODIMP
@ -648,12 +743,11 @@ void SpeechRecognition::GetLang(nsString& aRetVal) const { aRetVal = mLang; }
void SpeechRecognition::SetLang(const nsAString& aArg) { mLang = aArg; }
bool SpeechRecognition::GetContinuous(ErrorResult& aRv) const {
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return false;
return mContinuous;
}
void SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) {
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
mContinuous = aArg;
}
bool SpeechRecognition::InterimResults() const { return mInterimResults; }
@ -690,6 +784,10 @@ void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
return;
}
mEncodeTaskQueue = MakeAndAddRef<TaskQueue>(
GetMediaThreadPool(MediaThreadType::WEBRTC_DECODER),
"WebSpeechEncoderThread");
nsresult rv;
rv = mRecognitionService->Initialize(this);
if (NS_WARN_IF(NS_FAILED(rv))) {
@ -701,6 +799,7 @@ void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
if (aStream.WasPassed()) {
mStream = &aStream.Value();
mTrackIsOwned = false;
mStream->RegisterTrackListener(this);
nsTArray<RefPtr<AudioStreamTrack>> tracks;
mStream->GetAudioTracks(tracks);
@ -711,24 +810,40 @@ void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
}
}
} else {
mTrackIsOwned = true;
AutoNoJSAPI nojsapi;
RefPtr<SpeechRecognition> self(this);
MediaManager::Get()
->GetUserMedia(GetOwner(), constraints, aCallerType)
->Then(
GetCurrentThreadSerialEventTarget(), __func__,
[this, self](RefPtr<DOMMediaStream>&& aStream) {
[this, self,
generation = mStreamGeneration](RefPtr<DOMMediaStream>&& aStream) {
nsTArray<RefPtr<AudioStreamTrack>> tracks;
aStream->GetAudioTracks(tracks);
if (mAborted || mCurrentState != STATE_STARTING ||
mStreamGeneration != generation) {
// We were probably aborted. Exit early.
for (const RefPtr<AudioStreamTrack>& track : tracks) {
track->Stop();
}
return;
}
mStream = std::move(aStream);
mStream->RegisterTrackListener(this);
nsTArray<RefPtr<AudioStreamTrack>> tracks;
mStream->GetAudioTracks(tracks);
for (const RefPtr<AudioStreamTrack>& track : tracks) {
if (!track->Ended()) {
NotifyTrackAdded(track);
}
}
},
[this, self](RefPtr<MediaMgrError>&& error) {
[this, self,
generation = mStreamGeneration](RefPtr<MediaMgrError>&& error) {
if (mAborted || mCurrentState != STATE_STARTING ||
mStreamGeneration != generation) {
// We were probably aborted. Exit early.
return;
}
SpeechRecognitionErrorCode errorCode;
if (error->mName == MediaMgrError::Name::NotAllowedError) {
@ -746,44 +861,47 @@ void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
}
bool SpeechRecognition::SetRecognitionService(ErrorResult& aRv) {
if (!GetOwner()) {
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
return false;
}
// See:
// https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang
nsAutoString lang;
if (!mLang.IsEmpty()) {
mRecognitionService = GetSpeechRecognitionService(mLang);
if (!mRecognitionService) {
lang = mLang;
} else {
nsCOMPtr<Document> document = GetOwner()->GetExtantDoc();
if (!document) {
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
return false;
}
nsCOMPtr<Element> element = document->GetRootElement();
if (!element) {
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
return false;
}
return true;
nsAutoString lang;
element->GetLang(lang);
}
nsCOMPtr<nsPIDOMWindowInner> window = GetOwner();
if (!window) {
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
return false;
}
nsCOMPtr<Document> document = window->GetExtantDoc();
if (!document) {
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
return false;
}
nsCOMPtr<Element> element = document->GetRootElement();
if (!element) {
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
return false;
}
nsAutoString lang;
element->GetLang(lang);
mRecognitionService = GetSpeechRecognitionService(lang);
if (!mRecognitionService) {
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
auto result = CreateSpeechRecognitionService(GetOwner(), this, lang);
if (result.isErr()) {
switch (result.unwrapErr()) {
case ServiceCreationError::ServiceNotFound:
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
break;
default:
MOZ_CRASH("Unknown error");
}
return false;
}
mRecognitionService = result.unwrap();
MOZ_DIAGNOSTIC_ASSERT(mRecognitionService);
return true;
}
@ -794,11 +912,6 @@ bool SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv) {
}
uint32_t grammarListLength = mSpeechGrammarList->Length();
if (0 == grammarListLength) {
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
return false;
}
for (uint32_t count = 0; count < grammarListLength; ++count) {
RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv);
if (aRv.Failed()) {
@ -825,6 +938,7 @@ void SpeechRecognition::Abort() {
}
mAborted = true;
RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
NS_DispatchToMainThread(event);
}
@ -874,14 +988,13 @@ void SpeechRecognition::DispatchError(EventType aErrorType,
uint32_t SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
uint32_t aSampleCount) {
MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
MOZ_ASSERT(mAudioSamplesBuffer.get());
MOZ_ASSERT(mAudioSamplesBuffer);
int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
size_t samplesToCopy =
std::min(aSampleCount, mAudioSamplesPerChunk - mBufferedSamples);
memcpy(samplesBuffer + mBufferedSamples, aSamples,
samplesToCopy * sizeof(int16_t));
PodCopy(samplesBuffer + mBufferedSamples, aSamples, samplesToCopy);
mBufferedSamples += samplesToCopy;
return samplesToCopy;
@ -903,8 +1016,8 @@ uint32_t SpeechRecognition::SplitSamplesBuffer(
RefPtr<SharedBuffer> chunk =
SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));
memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
mAudioSamplesPerChunk * sizeof(int16_t));
PodCopy(static_cast<short*>(chunk->Data()), aSamplesBuffer + chunkStart,
mAudioSamplesPerChunk);
aResult.AppendElement(chunk.forget());
chunkStart += mAudioSamplesPerChunk;
@ -987,6 +1100,7 @@ const char* SpeechRecognition::GetName(FSMState aId) {
"STATE_IDLE", "STATE_STARTING",
"STATE_ESTIMATING", "STATE_WAITING_FOR_SPEECH",
"STATE_RECOGNIZING", "STATE_WAITING_FOR_RESULT",
"STATE_ABORTING",
};
MOZ_ASSERT(aId < STATE_COUNT);
@ -1009,6 +1123,11 @@ const char* SpeechRecognition::GetName(SpeechEvent* aEvent) {
return names[aEvent->mType];
}
TaskQueue* SpeechRecognition::GetTaskQueueForEncoding() const {
MOZ_ASSERT(NS_IsMainThread());
return mEncodeTaskQueue;
}
SpeechEvent::SpeechEvent(SpeechRecognition* aRecognition,
SpeechRecognition::EventType aType)
: Runnable("dom::SpeechEvent"),

View File

@ -32,6 +32,10 @@
namespace mozilla {
namespace media {
class ShutdownBlocker;
}
namespace dom {
#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \
@ -40,7 +44,6 @@ namespace dom {
class GlobalObject;
class AudioStreamTrack;
class SpeechRecognitionShutdownBlocker;
class SpeechEvent;
class SpeechTrackListener;
@ -62,8 +65,6 @@ class SpeechRecognition final : public DOMEventTargetHelper,
NS_DECL_NSIOBSERVER
nsISupports* GetParentObject() const;
JSObject* WrapObject(JSContext* aCx,
JS::Handle<JSObject*> aGivenProto) override;
@ -72,6 +73,11 @@ class SpeechRecognition final : public DOMEventTargetHelper,
static already_AddRefed<SpeechRecognition> Constructor(
const GlobalObject& aGlobal, ErrorResult& aRv);
static already_AddRefed<SpeechRecognition> WebkitSpeechRecognition(
const GlobalObject& aGlobal, ErrorResult& aRv) {
return Constructor(aGlobal, aRv);
}
already_AddRefed<SpeechGrammarList> Grammars() const;
void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);
@ -90,6 +96,8 @@ class SpeechRecognition final : public DOMEventTargetHelper,
uint32_t MaxAlternatives() const;
TaskQueue* GetTaskQueueForEncoding() const;
void SetMaxAlternatives(uint32_t aArg);
void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;
@ -153,6 +161,7 @@ class SpeechRecognition final : public DOMEventTargetHelper,
STATE_WAITING_FOR_SPEECH,
STATE_RECOGNIZING,
STATE_WAITING_FOR_RESULT,
STATE_ABORTING,
STATE_COUNT
};
@ -163,7 +172,7 @@ class SpeechRecognition final : public DOMEventTargetHelper,
bool ValidateAndSetGrammarList(ErrorResult& aRv);
NS_IMETHOD StartRecording(RefPtr<AudioStreamTrack>& aDOMStream);
NS_IMETHOD StopRecording();
RefPtr<GenericNonExclusivePromise> StopRecording();
uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
void NotifyError(SpeechEvent* aEvent);
@ -186,9 +195,19 @@ class SpeechRecognition final : public DOMEventTargetHelper,
RefPtr<DOMMediaStream> mStream;
RefPtr<AudioStreamTrack> mTrack;
bool mTrackIsOwned = false;
RefPtr<GenericNonExclusivePromise> mStopRecordingPromise;
RefPtr<SpeechTrackListener> mSpeechListener;
RefPtr<SpeechRecognitionShutdownBlocker> mShutdownBlocker;
nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
RefPtr<media::ShutdownBlocker> mShutdownBlocker;
// TaskQueue on which the service pre-processes the samples;
// it runs on a separate thread from the main thread
RefPtr<TaskQueue> mEncodeTaskQueue;
// A generation ID of the MediaStream a started session is for, so that
// a gUM request that resolves after the session has stopped, and a new
// one has started, can exit early. Main thread only. Can wrap.
uint8_t mStreamGeneration = 0;
FSMState mCurrentState;
@ -197,6 +216,10 @@ class SpeechRecognition final : public DOMEventTargetHelper,
uint32_t mAudioSamplesPerChunk;
// maximum time in milliseconds the engine will wait for voice
// before returning a 'no speech detected' error
uint32_t mSpeechDetectionTimeoutMs;
// buffer holds one chunk of mAudioSamplesPerChunk
// samples before feeding it to mEndpointer
RefPtr<SharedBuffer> mAudioSamplesBuffer;
@ -209,6 +232,10 @@ class SpeechRecognition final : public DOMEventTargetHelper,
RefPtr<SpeechGrammarList> mSpeechGrammarList;
// private flag recording whether the page set continuous mode via
// the API's SetContinuous()
bool mContinuous;
// WebSpeechAPI (http://bit.ly/1gIl7DC) states:
//
// 1. Default value MUST be false

View File

@ -12,6 +12,7 @@ XPIDL_SOURCES = [
]
EXPORTS.mozilla.dom += [
'OnlineSpeechRecognitionService.h',
'SpeechGrammar.h',
'SpeechGrammarList.h',
'SpeechRecognition.h',
@ -21,6 +22,12 @@ EXPORTS.mozilla.dom += [
'SpeechTrackListener.h',
]
EXPORTS += [
'endpointer.h',
'energy_endpointer.h',
'energy_endpointer_params.h',
]
if CONFIG['MOZ_WEBSPEECH_TEST_BACKEND']:
EXPORTS.mozilla.dom += [
'test/FakeSpeechRecognitionService.h',
@ -30,6 +37,7 @@ UNIFIED_SOURCES += [
'endpointer.cc',
'energy_endpointer.cc',
'energy_endpointer_params.cc',
'OnlineSpeechRecognitionService.cpp',
'SpeechGrammar.cpp',
'SpeechGrammarList.cpp',
'SpeechRecognition.cpp',
@ -44,8 +52,13 @@ if CONFIG['MOZ_WEBSPEECH_TEST_BACKEND']:
'test/FakeSpeechRecognitionService.cpp',
]
USE_LIBS += [
'jsoncpp',
]
LOCAL_INCLUDES += [
'/dom/base',
'/toolkit/components/jsoncpp/include',
]
include('/ipc/chromium/chromium-config.mozbuild')

View File

@ -30,6 +30,7 @@ FakeSpeechRecognitionService::~FakeSpeechRecognitionService() = default;
NS_IMETHODIMP
FakeSpeechRecognitionService::Initialize(
WeakPtr<SpeechRecognition> aSpeechRecognition) {
MOZ_ASSERT(NS_IsMainThread());
mRecognition = aSpeechRecognition;
nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
@ -40,11 +41,15 @@ FakeSpeechRecognitionService::Initialize(
NS_IMETHODIMP
FakeSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment,
int32_t aSampleRate) {
MOZ_ASSERT(!NS_IsMainThread());
return NS_OK;
}
NS_IMETHODIMP
FakeSpeechRecognitionService::SoundEnd() { return NS_OK; }
FakeSpeechRecognitionService::SoundEnd() {
MOZ_ASSERT(NS_IsMainThread());
return NS_OK;
}
NS_IMETHODIMP
FakeSpeechRecognitionService::ValidateAndSetGrammarList(
@ -53,7 +58,10 @@ FakeSpeechRecognitionService::ValidateAndSetGrammarList(
}
NS_IMETHODIMP
FakeSpeechRecognitionService::Abort() { return NS_OK; }
FakeSpeechRecognitionService::Abort() {
MOZ_ASSERT(NS_IsMainThread());
return NS_OK;
}
NS_IMETHODIMP
FakeSpeechRecognitionService::Observe(nsISupports* aSubject, const char* aTopic,
@ -85,7 +93,6 @@ FakeSpeechRecognitionService::Observe(nsISupports* aSubject, const char* aTopic,
event->mRecognitionResultList = BuildMockResultList();
NS_DispatchToMainThread(event);
}
return NS_OK;
}

View File

@ -22,7 +22,7 @@ namespace mozilla {
class FakeSpeechRecognitionService : public nsISpeechRecognitionService,
public nsIObserver {
public:
NS_DECL_ISUPPORTS
NS_DECL_THREADSAFE_ISUPPORTS
NS_DECL_NSISPEECHRECOGNITIONSERVICE
NS_DECL_NSIOBSERVER

View File

@ -163,7 +163,16 @@ function performTest(options) {
);
SpecialPowers.pushPrefEnv({ set: prefs }, function() {
var sr = new SpeechRecognition();
var sr;
if (!options.webkit) {
sr = new SpeechRecognition();
} else {
sr = new webkitSpeechRecognition();
var grammar = new webkitSpeechGrammar();
var speechrecognitionlist = new webkitSpeechGrammarList();
speechrecognitionlist.addFromString("", 1);
sr.grammars = speechrecognitionlist;
}
var em = new EventManager(sr);
for (var eventName in options.expectedEvents) {

View File

@ -0,0 +1,77 @@
const CC = Components.Constructor;
// Context structure - we need to set this up properly to pass to setObjectState
const ctx = {
QueryInterface: function(iid) {
if (iid.equals(Components.interfaces.nsISupports))
return this;
throw Components.results.NS_ERROR_NO_INTERFACE;
}
};
function setRequest(request) {
setObjectState("request", request);
}
function getRequest() {
let request;
getObjectState("request", v => { request = v; });
return request;
}
function handleRequest(request, response) {
response.processAsync();
if (request.queryString == "save") {
// Get the context structure and finish the old request
getObjectState("context", function(obj) {
const savedCtx = obj.wrappedJSObject;
request = savedCtx.request;
response.setHeader("Content-Type", "application/octet-stream", false);
response.setHeader("Access-Control-Allow-Origin", "*", false);
response.setHeader("Cache-Control", "no-cache", false);
response.setStatusLine(request.httpVersion, 200, "OK");
const input = request.bodyInputStream;
const output = response.bodyOutputStream;
let bodyAvail;
while ((bodyAvail = input.available()) > 0) {
output.writeFrom(input, bodyAvail);
}
response.finish();
});
return;
} else if (request.queryString == "malformedresult=1" || request.queryString == "emptyresult=1") {
const jsonOK = request.queryString == "malformedresult=1" ? '{"status":"ok","dat' : '{"status":"ok","data":[]}';
response.setHeader("Content-Length", String(jsonOK.length), false);
response.setHeader("Content-Type", "application/json", false);
response.setHeader("Access-Control-Allow-Origin", "*", false);
response.setHeader("Cache-Control", "no-cache", false);
response.setStatusLine(request.httpVersion, 200, "OK");
response.write(jsonOK, jsonOK.length);
response.finish();
} else if (request.queryString == "hangup=1") {
response.finish();
} else if (request.queryString == "return400=1") {
jsonOK = "{'message':'Bad header:accept-language-stt'}";
response.setHeader("Content-Length", String(jsonOK.length), false);
response.setHeader("Content-Type", "application/json", false);
response.setHeader("Access-Control-Allow-Origin", "*", false);
response.setHeader("Cache-Control", "no-cache", false);
response.setStatusLine(request.httpVersion, 400, "Bad Request");
response.write(jsonOK, jsonOK.length);
response.finish();
} else {
ctx.wrappedJSObject = ctx;
ctx.request = request;
setObjectState("context", ctx);
jsonOK = '{"status":"ok","data":[{"confidence":0.9085610,"text":"hello"}]}';
response.setHeader("Content-Length", String(jsonOK.length), false);
response.setHeader("Content-Type", "application/json", false);
response.setHeader("Access-Control-Allow-Origin", "*", false);
response.setHeader("Cache-Control", "no-cache", false);
response.setStatusLine(request.httpVersion, 200, "OK");
response.write(jsonOK, jsonOK.length);
response.finish();
}
}

View File

@ -5,6 +5,9 @@ support-files =
head.js
hello.ogg
hello.ogg^headers^
http_requesthandler.sjs
sinoid+hello.ogg
sinoid+hello.ogg^headers^
silence.ogg
silence.ogg^headers^
[test_abort.html]
@ -16,6 +19,12 @@ tags=capturestream
skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538363
[test_nested_eventloop.html]
skip-if = toolkit == 'android'
[test_online_400_response.html]
[test_online_hangup.html]
[test_online_http.html]
[test_online_http_webkit.html]
[test_online_malformed_result_handling.html]
[test_online_empty_result_handling.html]
[test_preference_enable.html]
[test_recognition_service_error.html]
skip-if = (os == "win" && processor == "aarch64") # aarch64 due to 1538360

Binary file not shown.

View File

@ -0,0 +1 @@
Cache-Control: no-store

View File

@ -60,7 +60,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
eventsToRequest: [],
expectedEvents,
doneFunc: (nextEventIdx < eventsToAbortOn.length) ? doNextTest : SimpleTest.finish,
prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
prefs: [["media.webspeech.test.fake_fsm_events", true],
["media.webspeech.test.fake_recognition_service", true],
["media.webspeech.recognition.timeout", 100000]]
});
}

View File

@ -32,7 +32,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
'end': null
},
doneFunc: SimpleTest.finish,
prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
prefs: [["media.webspeech.test.fake_fsm_events", true],
["media.webspeech.test.fake_recognition_service", true],
["media.webspeech.recognition.timeout", 100000]]
});
</script>
</pre>

View File

@ -91,7 +91,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
'result': buildResultCallback("Mock final result"),
'end': endHandler,
},
prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
prefs: [["media.webspeech.test.fake_fsm_events", true],
["media.webspeech.test.fake_recognition_service", true],
["media.webspeech.recognition.timeout", 100000]]
});
</script>

View File

@ -72,7 +72,8 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
},
doneFunc,
prefs: [["media.webspeech.test.fake_fsm_events", true],
["media.webspeech.test.fake_recognition_service", true]]
["media.webspeech.test.fake_recognition_service", true],
["media.webspeech.recognition.timeout", 100000]]
});
</script>

View File

@ -0,0 +1,47 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
The intent of this file is to test the speech recognition service behavior
whenever the server returns a 400 error
-->
<head>
<meta charset="utf-8">
<title>Test for Bug 1248897 -- Online speech service</title>
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
<script type="application/javascript" src="head.js"></script>
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<pre id="test">
<script type="text/javascript">
SimpleTest.waitForExplicitFinish();
performTest({
eventsToRequest: [],
expectedEvents: {
"start": null,
"audiostart": null,
"audioend": null,
"end": null,
'error': buildErrorCallback(errorCodes.NETWORK),
"speechstart": null,
"speechend": null
},
doneFunc: SimpleTest.finish,
prefs: [["media.webspeech.recognition.enable", true],
["media.webspeech.recognition.force_enable", true],
["media.webspeech.service.endpoint",
"http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?return400=1"],
["media.webspeech.recognition.timeout", 100000]]
});
</script>
</pre>
</body>
</html>

View File

@ -0,0 +1,48 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
The intent of this file is to test the speech recognition service behavior
whenever the server returns a valid JSON object, but without any transcription
results in it, for example: `{"status":"ok","data":[]}`
-->
<head>
<meta charset="utf-8">
<title>Test for Bug 1248897 -- Online speech service</title>
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
<script type="application/javascript" src="head.js"></script>
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<pre id="test">
<script type="text/javascript">
SimpleTest.waitForExplicitFinish();
performTest({
eventsToRequest: [],
expectedEvents: {
"start": null,
"audiostart": null,
"audioend": null,
"end": null,
'error': buildErrorCallback(errorCodes.NETWORK),
"speechstart": null,
"speechend": null
},
doneFunc: SimpleTest.finish,
prefs: [["media.webspeech.recognition.enable", true],
["media.webspeech.recognition.force_enable", true],
["media.webspeech.service.endpoint",
"http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?emptyresult=1"],
["media.webspeech.recognition.timeout", 100000]]
});
</script>
</pre>
</body>
</html>

View File

@ -0,0 +1,47 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
The intent of this file is to test the speech recognition service behavior
whenever the server hangs up the connection without sending any response
-->
<head>
<meta charset="utf-8">
<title>Test for Bug 1248897 -- Online speech service</title>
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
<script type="application/javascript" src="head.js"></script>
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<pre id="test">
<script type="text/javascript">
SimpleTest.waitForExplicitFinish();
performTest({
eventsToRequest: [],
expectedEvents: {
"start": null,
"audiostart": null,
"audioend": null,
"end": null,
'error': buildErrorCallback(errorCodes.NETWORK),
"speechstart": null,
"speechend": null
},
doneFunc: SimpleTest.finish,
prefs: [["media.webspeech.recognition.enable", true],
["media.webspeech.recognition.force_enable", true],
["media.webspeech.service.endpoint",
"http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?hangup=1"],
["media.webspeech.recognition.timeout", 100000]]
});
</script>
</pre>
</body>
</html>

View File

@ -0,0 +1,89 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
The intent of this file is to test a successful speech recognition request and
that the audio is being properly encoded
-->
<head>
<meta charset="utf-8">
<title>Test for Bug 1248897 -- Online speech service</title>
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
<script type="application/javascript" src="head.js"></script>
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<pre id="test">
<script type="text/javascript">
SimpleTest.waitForExplicitFinish();
async function validateRawAudio(buffer) {
const ac = new AudioContext();
const decodedData = await ac.decodeAudioData(buffer);
const source = ac.createBufferSource();
source.buffer = decodedData;
source.loop = true;
const analyser = ac.createAnalyser();
analyser.smoothingTimeConstant = 0.2;
analyser.fftSize = 1024;
source.connect(analyser);
const binIndexForFrequency = frequency =>
1 + Math.round(frequency * analyser.fftSize / ac.sampleRate);
source.start();
const data = new Uint8Array(analyser.frequencyBinCount);
const start = performance.now();
while (true) {
if (performance.now() - start > 10000) {
return false;
}
analyser.getByteFrequencyData(data);
if (data[binIndexForFrequency(200)] < 50 &&
data[binIndexForFrequency(440)] > 180 &&
data[binIndexForFrequency(1000)] < 50) {
return true;
}
await new Promise(r => requestAnimationFrame(r));
}
}
async function verifyEncodedAudio(requestUrl) {
try {
const response = await fetch(requestUrl);
const buffer = await response.arrayBuffer();
ok(await validateRawAudio(buffer), "Audio encoding is valid");
} catch(e) {
ok(false, e);
} finally {
SimpleTest.finish();
}
}
performTest({
eventsToRequest: {},
expectedEvents: {
"start": null,
"audiostart": null,
"audioend": null,
"end": null,
"result": () => verifyEncodedAudio("http_requesthandler.sjs?save"),
"speechstart": null,
"speechend": null
},
audioSampleFile: "sinoid+hello.ogg",
prefs: [["media.webspeech.recognition.enable", true],
["media.webspeech.recognition.force_enable", true],
["media.webspeech.service.endpoint",
"http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs"],
["media.webspeech.recognition.timeout", 100000]]
});
</script>
</pre>
</body>
</html>

View File

@ -0,0 +1,90 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
The intent of this file is to test a successful speech recognition request and
that the audio is being properly encoded
-->
<head>
<meta charset="utf-8">
<title>Test for Bug 1248897 -- Online speech service</title>
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
<script type="application/javascript" src="head.js"></script>
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<pre id="test">
<script type="text/javascript">
SimpleTest.waitForExplicitFinish();
async function validateRawAudio(buffer) {
const ac = new AudioContext();
const decodedData = await ac.decodeAudioData(buffer);
const source = ac.createBufferSource();
source.buffer = decodedData;
source.loop = true;
const analyser = ac.createAnalyser();
analyser.smoothingTimeConstant = 0.2;
analyser.fftSize = 1024;
source.connect(analyser);
const binIndexForFrequency = frequency =>
1 + Math.round(frequency * analyser.fftSize / ac.sampleRate);
source.start();
const data = new Uint8Array(analyser.frequencyBinCount);
const start = performance.now();
while (true) {
if (performance.now() - start > 10000) {
return false;
}
analyser.getByteFrequencyData(data);
if (data[binIndexForFrequency(200)] < 50 &&
data[binIndexForFrequency(440)] > 180 &&
data[binIndexForFrequency(1000)] < 50) {
return true;
}
await new Promise(r => requestAnimationFrame(r));
}
}
async function verifyEncodedAudio(requestUrl) {
try {
const response = await fetch(requestUrl);
const buffer = await response.arrayBuffer();
ok(await validateRawAudio(buffer), "Audio encoding is valid");
} catch(e) {
ok(false, e);
} finally {
SimpleTest.finish();
}
}
performTest({
eventsToRequest: {},
expectedEvents: {
"start": null,
"audiostart": null,
"audioend": null,
"end": null,
"result": () => verifyEncodedAudio("http_requesthandler.sjs?save"),
"speechstart": null,
"speechend": null
},
audioSampleFile: "sinoid+hello.ogg",
prefs: [["media.webspeech.recognition.enable", true],
["media.webspeech.recognition.force_enable", true],
["media.webspeech.service.endpoint",
"http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs"],
["media.webspeech.recognition.timeout", 100000]],
webkit: true
});
</script>
</pre>
</body>
</html>

View File

@ -0,0 +1,48 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=1248897
The intent of this file is to test the speech recognition service behavior
whenever the server returns an invalid/corrupted JSON object, for example:
`{"status":"ok","dat`
-->
<head>
<meta charset="utf-8">
<title>Test for Bug 1248897 -- Online speech service</title>
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
<script type="application/javascript" src="head.js"></script>
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1248897">Mozilla Bug 1248897</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<pre id="test">
<script type="text/javascript">
SimpleTest.waitForExplicitFinish();
performTest({
eventsToRequest: [],
expectedEvents: {
"start": null,
"audiostart": null,
"audioend": null,
"end": null,
'error': buildErrorCallback(errorCodes.NETWORK),
"speechstart": null,
"speechend": null
},
doneFunc: SimpleTest.finish,
prefs: [["media.webspeech.recognition.enable", true],
["media.webspeech.recognition.force_enable", true],
["media.webspeech.service.endpoint",
"http://mochi.test:8888/tests/dom/media/webspeech/recognition/test/http_requesthandler.sjs?malformedresult=1"],
["media.webspeech.recognition.timeout", 100000]]
});
</script>
</pre>
</body>
</html>

View File

@ -34,7 +34,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
'end': null
},
doneFunc: SimpleTest.finish,
prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
prefs: [["media.webspeech.test.fake_fsm_events", true],
["media.webspeech.test.fake_recognition_service", true],
["media.webspeech.recognition.timeout", 100000]]
});
</script>

View File

@ -34,7 +34,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
'end': null
},
doneFunc:SimpleTest.finish,
prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
prefs: [["media.webspeech.test.fake_fsm_events", true],
["media.webspeech.test.fake_recognition_service", true],
["media.webspeech.recognition.timeout", 100000]]
});
</script>

View File

@ -31,7 +31,9 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=650295
},
doneFunc: SimpleTest.finish,
audioSampleFile: "silence.ogg",
prefs: [["media.webspeech.test.fake_fsm_events", true], ["media.webspeech.test.fake_recognition_service", true]]
prefs: [["media.webspeech.test.fake_fsm_events", true],
["media.webspeech.test.fake_recognition_service", true],
["media.webspeech.recognition.timeout", 1000]]
});
</script>

View File

@ -11,6 +11,7 @@
*/
[Pref="media.webspeech.recognition.enable",
NamedConstructor=webkitSpeechGrammar,
Func="SpeechRecognition::IsAuthorized",
Exposed=Window]
interface SpeechGrammar {

View File

@ -11,6 +11,7 @@
*/
[Pref="media.webspeech.recognition.enable",
NamedConstructor=webkitSpeechGrammarList,
Func="SpeechRecognition::IsAuthorized",
Exposed=Window]
interface SpeechGrammarList {

View File

@ -11,6 +11,7 @@
*/
[Pref="media.webspeech.recognition.enable",
NamedConstructor=webkitSpeechRecognition,
Func="SpeechRecognition::IsAuthorized",
Exposed=Window]
interface SpeechRecognition : EventTarget {

View File

@ -438,6 +438,12 @@ if defined('MOZ_WEBSPEECH'):
'headers': ['mozilla/dom/nsSynthVoiceRegistry.h'],
'constructor': 'mozilla::dom::nsSynthVoiceRegistry::GetInstanceForService',
},
{
'cid': '{0ff5ce56-5b09-4db8-adc6-8266af95f864}',
'contract_ids': ['@mozilla.org/webspeech/service;1?name=online'],
'type': 'mozilla::OnlineSpeechRecognitionService',
'headers': ['mozilla/dom/OnlineSpeechRecognitionService.h'],
},
]
if defined('MOZ_WEBSPEECH_TEST_BACKEND'):
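With this registration in place, chrome code can instantiate the service by contract ID, mirroring what CreateSpeechRecognitionService() does in C++. A sketch, assuming a chrome-privileged context:

  const service = Cc["@mozilla.org/webspeech/service;1?name=online"]
                    .createInstance(Ci.nsISpeechRecognitionService);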

View File

@ -47,6 +47,7 @@
#ifdef MOZ_WEBSPEECH
# include "mozilla/dom/nsSynthVoiceRegistry.h"
# include "mozilla/dom/OnlineSpeechRecognitionService.h"
#endif
#include "mozilla/dom/PushNotifier.h"