TTS: Finish implementing the Windows TTS manager

2025-02-24 13:13:58 +00:00 · 2019-07-16 22:09:05 -07:00 · 2019-07-16 22:09:05 -07:00 · 318c6d7ec6
commit 318c6d7ec6
parent d2d34a4eca
8 changed files with 270 additions and 35 deletions
--- a/backends/text-to-speech/linux/linux-text-to-speech.cpp
+++ b/backends/text-to-speech/linux/linux-text-to-speech.cpp
@ -230,4 +230,25 @@ void LinuxTextToSpeechManager::updateVoices() {

 }

+bool LinuxTextToSpeechManager::popState() {
+	// With no saved state underneath the current one there is nothing to restore.
+	Common::TTSState *restored = _ttsState->_next;
+	if (restored == nullptr)
+		return true;
+
+	// Release the per-voice data of the state that is being discarded.
+	Common::TTSVoice *voice = _ttsState->_availaibleVoices.begin();
+	while (voice < _ttsState->_availaibleVoices.end()) {
+		free(voice->getData());
+		voice++;
+	}
+
+	// Drop the current state and make the saved one current again.
+	delete _ttsState;
+	_ttsState = restored;
+
+	// Re-apply the restored settings through the regular setters.
+	setLanguage(_ttsState->_language);
+	setPitch(_ttsState->_pitch);
+	setVolume(_ttsState->_volume);
+	setRate(_ttsState->_rate);
+	return false;
+}
+
+
 #endif
--- a/backends/text-to-speech/linux/linux-text-to-speech.h
+++ b/backends/text-to-speech/linux/linux-text-to-speech.h
@ -63,6 +63,8 @@ public:

 	virtual void setLanguage(Common::String language);

+	virtual bool popState();
+
 	void updateState(SpeechState state);

 private:
--- a/backends/text-to-speech/windows/windows-text-to-speech.cpp
+++ b/backends/text-to-speech/windows/windows-text-to-speech.cpp
@ -31,6 +31,7 @@
 #include <Servprov.h>
 #include <sapi.h>
 #include "backends/text-to-speech/windows/sphelper-scummvm.h"
+#include "backends/platform/sdl/win32/win32_wrapper.h"

 #include "backends/text-to-speech/windows/windows-text-to-speech.h"

@ -43,82 +44,307 @@

 ISpVoice *_voice;

+// We need this pointer to be able to stop speech immediately.
+ISpAudio *_audio;
+
 WindowsTextToSpeechManager::WindowsTextToSpeechManager()
 	: _speechState(BROKEN){
 	init();
 }

 void WindowsTextToSpeechManager::init() {
+	// init COM
 	if (FAILED(::CoInitialize(NULL)))
 		return;

+	// init voice
 	HRESULT hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&_voice);
 	if (!SUCCEEDED(hr)) {
 		warning("Could not initialize TTS voice");
 		return;
 	}
-	updateVoices();
-	_speechState = READY;
+	setLanguage("en");
+
+	// init audio
+	CSpStreamFormat format;
+	format.AssignFormat(SPSF_11kHz8BitMono);
+	ISpObjectToken *pToken = nullptr;
+	hr = SpGetDefaultTokenFromCategoryId(SPCAT_AUDIOOUT, &pToken);
+	if (FAILED(hr) || !pToken) {
+		warning("Could not initialize TTS audio");
+		// updateVoices() may have moved the state to NO_VOICE, and a later call could
+		// promote that to READY; without an audio object the manager must stay BROKEN.
+		_speechState = BROKEN;
+		return;
+	}
+	hr = pToken->CreateInstance(NULL, CLSCTX_ALL, IID_ISpAudio, (void **)&_audio);
+	// The token is only needed to create the audio object, so drop our reference now.
+	pToken->Release();
+	if (FAILED(hr) || !_audio) {
+		warning("Could not initialize TTS audio");
+		// say() and stop() dereference _audio unconditionally once the state is usable.
+		_speechState = BROKEN;
+		return;
+	}
+	_audio->SetFormat(format.FormatId(), format.WaveFormatExPtr());
+	_voice->SetOutput(_audio, FALSE);
+
+	if(_ttsState->_availaibleVoices.size() > 0) {
+		_speechState = READY;
+		// The setVoice(0) issued by setLanguage() above returned early because the
+		// state was still BROKEN, so select the first matching voice here.
+		setVoice(0);
+	} else {
+		_speechState = NO_VOICE;
+	}
 }

 WindowsTextToSpeechManager::~WindowsTextToSpeechManager() {
+	// Release the voice tokens held by the current state.
+	freeVoices();
 	if (_voice)
 		_voice->Release();
+	// The audio output object created in init() is owned by us as well;
+	// the pointer is still null when init() bailed out before creating it.
+	if (_audio)
+		_audio->Release();
 	::CoUninitialize();
 }

 bool WindowsTextToSpeechManager::say(Common::String str) {
-	return true;
+	// Returns true on failure and false when the text was queued for speaking.
+	if(_speechState == BROKEN || _speechState == NO_VOICE) {
+		warning("The tts cannot speak in this state");
+		return true;
+	}
+	// We have to set the pitch by prepending xml code at the start of the said string;
+	Common::String pitch= Common::String::format("<pitch absmiddle=\"%d\">", _ttsState->_pitch);
+	str.replace((uint32)0, 0, pitch);
+
+	// Convert before touching the audio device so that a failed conversion
+	// (presumably reported as a null pointer - confirm against win32_wrapper)
+	// does not interrupt whatever is currently being spoken.
+	WCHAR *strW = Win32::ansiToUnicode(str.c_str());
+	if (!strW) {
+		warning("Could not convert the text for the tts");
+		return true;
+	}
+
+	if (isPaused()) {
+		resume();
+	}
+	// Restarting the audio object cuts off the current speech immediately.
+	_audio->SetState(SPAS_STOP, 0);
+	_audio->SetState(SPAS_RUN, 0);
+
+	bool result = _voice->Speak(strW, SPF_ASYNC | SPF_PURGEBEFORESPEAK, NULL) != S_OK;
+	free(strW);
+	// Only report SPEAKING when the text was actually handed to the voice.
+	if (!result)
+		_speechState = SPEAKING;
+	return result;
 }

 bool WindowsTextToSpeechManager::stop() {
-	return true;
+	// Returns true when the manager is unusable, false once speech has been stopped.
+	const bool unusable = (_speechState == BROKEN || _speechState == NO_VOICE);
+	if (unusable)
+		return true;
+	// resume() returns without doing anything unless the manager is paused.
+	resume();
+	// Restart the audio object, then speak nothing with the purge flag set.
+	_audio->SetState(SPAS_STOP, 0);
+	_audio->SetState(SPAS_RUN, 0);
+	_voice->Speak(NULL, SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, 0);
+	_speechState = READY;
+	return false;
 }

 bool WindowsTextToSpeechManager::pause() {
-	return true;
+	// Returns true when the manager is unusable, false otherwise
+	// (including when it was already paused).
+	switch (_speechState) {
+	case BROKEN:
+	case NO_VOICE:
+		return true;
+	case PAUSED:
+		return false;
+	default:
+		break;
+	}
+	_voice->Pause();
+	_speechState = PAUSED;
+	return false;
 }

 bool WindowsTextToSpeechManager::resume() {
-	return true;
+	// Returns true when the manager is unusable, false otherwise.
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
+		return true;
+	if (isPaused()) {
+		_voice->Resume();
+		// Pick the follow-up state from what the voice reports after resuming.
+		_speechState = isSpeaking() ? SPEAKING : READY;
+	}
+	return false;
 }

 bool WindowsTextToSpeechManager::isSpeaking() {
-	return true;
+	// Reports whether the SAPI voice is currently speaking.
+	if(_speechState == BROKEN || _speechState == NO_VOICE)
+		return false;
+	SPVOICESTATUS eventStatus;
+	// eventStatus is only meaningful when GetStatus() succeeds; do not read it otherwise.
+	if (FAILED(_voice->GetStatus(&eventStatus, NULL)))
+		return false;
+	return eventStatus.dwRunningState == SPRS_IS_SPEAKING;
 }

 bool WindowsTextToSpeechManager::isPaused() {
-	return true;
+	// Paused-ness is tracked purely in _speechState; the voice itself is not queried.
+	const bool paused = (_speechState == PAUSED);
+	return paused;
 }

 bool WindowsTextToSpeechManager::isReady() {
-	return true;
+	// Ready means: usable, not paused and not currently speaking.
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
+		return false;
+	return !(_speechState == PAUSED || isSpeaking());
 }

 void WindowsTextToSpeechManager::setVoice(unsigned index) {
+	// Selects the voice at the given position in the current state's voice list.
+	// Does nothing while the manager is BROKEN or has no voice.
+	if(_speechState == BROKEN || _speechState == NO_VOICE)
+		return;
+	// Same contract style as setRate()/setVolume(): an out-of-range argument is a caller bug.
+	assert(index < _ttsState->_availaibleVoices.size());
+	_voice->SetVoice((ISpObjectToken *) _ttsState->_availaibleVoices[index].getData());
 }

 void WindowsTextToSpeechManager::setRate(int rate) {
+	// Ignored while the manager is unusable; the rate must lie within [-10, 10].
+	const bool usable = (_speechState != BROKEN && _speechState != NO_VOICE);
+	if (!usable)
+		return;
+	assert(-10 <= rate && rate <= 10);
+	// Remember the value in the current state and hand it to the voice.
+	_ttsState->_rate = rate;
+	_voice->SetRate(rate);
 }

 void WindowsTextToSpeechManager::setPitch(int pitch) {
+	// The pitch is only stored here; say() applies it by prepending a <pitch> tag.
+	const bool usable = (_speechState != BROKEN && _speechState != NO_VOICE);
+	if (usable)
+		_ttsState->_pitch = pitch;
 }

 void WindowsTextToSpeechManager::setVolume(unsigned volume) {
+	// Ignored while the manager is unusable; the volume must not exceed 100.
+	const bool usable = (_speechState != BROKEN && _speechState != NO_VOICE);
+	if (!usable)
+		return;
+	assert(volume <= 100);
+	// Remember the value in the current state and hand it to the voice.
+	_ttsState->_volume = volume;
+	_voice->SetVolume(volume);
 }

 int WindowsTextToSpeechManager::getVolume() {
-	return 0;
+	// Report the volume remembered in the current state (written by setVolume());
+	// the SAPI voice itself is not queried.
+	return _ttsState->_volume;
+}
+
+void WindowsTextToSpeechManager::freeVoices() {
+	// Every voice entry carries an ISpObjectToken reference; release each one,
+	// then empty the list of the current state.
+	Common::TTSVoice *voice = _ttsState->_availaibleVoices.begin();
+	for (; voice < _ttsState->_availaibleVoices.end(); voice++) {
+		((ISpObjectToken *)voice->getData())->Release();
+	}
+	_ttsState->_availaibleVoices.clear();
 }

 void WindowsTextToSpeechManager::setLanguage(Common::String language) {
+	// A language of "C" is mapped to "en"; any other value is stored as given.
+	_ttsState->_language = (language == "C") ? Common::String("en") : language;
+	// Rebuild the voice list for the new language and select its first entry.
+	updateVoices();
+	setVoice(0);
 }

-void WindowsTextToSpeechManager::createVoice(int typeNumber, Common::TTSVoice::Gender gender, char *description) {
+void WindowsTextToSpeechManager::createVoice(void *cpVoiceToken) {
+	// Takes over the caller's reference to the token: it is either stored in the
+	// voice list of the current state or released before returning.
+	ISpObjectToken *voiceToken = (ISpObjectToken *) cpVoiceToken;
+
+	// description
+	WCHAR *descW = nullptr;
+	HRESULT hr = SpGetDescription(voiceToken, &descW);
+	if (FAILED(hr) || !descW) {
+		voiceToken->Release();
+		warning("Could not get the description of a voice");
+		return;
+	}
+	char *buffer = Win32::unicodeToAnsi(descW);
+	Common::String desc = buffer;
+	free(buffer);
+	// NOTE(review): SAPI's SpGetDescription hands out CoTaskMemAlloc'ed memory;
+	// confirm that the sphelper-scummvm.h variant does the same.
+	CoTaskMemFree(descW);
+
+	// voice attributes
+	ISpDataKey *key = nullptr;
+	hr = voiceToken->OpenKey(L"Attributes", &key);
+
+	if (FAILED(hr)) {
+		voiceToken->Release();
+		warning("Could not open attribute key for voice: %s", desc.c_str());
+		return;
+	}
+	LPWSTR data = nullptr;
+
+	// language
+	hr = key->GetStringValue(L"Language", &data);
+	if (FAILED(hr)) {
+		key->Release();
+		voiceToken->Release();
+		warning("Could not get the language attribute for voice: %s", desc.c_str());
+		return;
+	}
+	buffer = Win32::unicodeToAnsi(data);
+	Common::String language = lcidToLocale(buffer);
+	free(buffer);
+	CoTaskMemFree(data);
+
+	// only get the voices for the current language
+	if (language != _ttsState->_language) {
+		key->Release();
+		voiceToken->Release();
+		return;
+	}
+
+	// gender
+	hr = key->GetStringValue(L"Gender", &data);
+	// The attribute key is not needed past this point, whatever the outcome.
+	key->Release();
+	if (FAILED(hr)) {
+		voiceToken->Release();
+		warning("Could not get the gender attribute for voice: %s", desc.c_str());
+		return;
+	}
+	buffer = Win32::unicodeToAnsi(data);
+	// Anything that is not exactly "Male" (including a failed conversion) counts as female.
+	Common::TTSVoice::Gender gender = (buffer && !strcmp(buffer, "Male")) ? Common::TTSVoice::MALE : Common::TTSVoice::FEMALE;
+	free(buffer);
+	CoTaskMemFree(data);
+
+	_ttsState->_availaibleVoices.push_back(Common::TTSVoice(gender, (void *) voiceToken, desc));
+}
+
+int strToInt(Common::String str) {
+	// Despite the name this parses a hexadecimal number (case-insensitive) and
+	// stops at the first character that is not a hex digit.
+	str.toUppercase();
+	int value = 0;
+	for (unsigned pos = 0; pos < str.size(); pos++) {
+		const char c = str[pos];
+		int digit;
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'A' && c <= 'F')
+			digit = c - 'A' + 10;
+		else
+			break;
+		value = value * 16 + digit;
+	}
+	return value;
+}
+
+Common::String WindowsTextToSpeechManager::lcidToLocale(Common::String lcid) {
+	// Converts a hexadecimal LCID string into the ISO 639 language name that
+	// GetLocaleInfoW reports for it; returns an empty string when the lookup fails.
+	LCID locale = strToInt(lcid);
+	// First call: ask for the required buffer size (in characters, terminator included).
+	int nchars = GetLocaleInfoW(locale, LOCALE_SISO639LANGNAME, NULL, 0);
+	if (nchars <= 0)
+		return Common::String();
+	wchar_t *languageCode = new wchar_t[nchars];
+	if (GetLocaleInfoW(locale, LOCALE_SISO639LANGNAME, languageCode, nchars) == 0) {
+		// Nothing was written to the buffer, so it must not be read.
+		delete[] languageCode;
+		return Common::String();
+	}
+	char *resultTmp = Win32::unicodeToAnsi(languageCode);
+	delete[] languageCode;
+	Common::String result;
+	if (resultTmp) {
+		result = resultTmp;
+		free(resultTmp);
+	}
+	return result;
 }

 void WindowsTextToSpeechManager::updateVoices() {
+	// Rebuilds the voice list of the current state from the installed SAPI voices.
+	freeVoices();
+	HRESULT hr = S_OK;
+	ISpObjectToken *cpVoiceToken = nullptr;
+	IEnumSpObjectTokens *cpEnum = nullptr;
+	unsigned long ulCount = 0;

+	hr = SpEnumTokens(SPCAT_VOICES, NULL, NULL, &cpEnum);
+	if (SUCCEEDED(hr) && cpEnum) {
+		hr = cpEnum->GetCount(&ulCount);
+	} else {
+		// No enumerator: keep the loop below from running.
+		ulCount = 0;
+	}
+	// Each candidate is tried with a short test phrase; mute the voice meanwhile.
+	_voice->SetVolume(0);
+	while (SUCCEEDED(hr) && cpEnum && ulCount--) {
+		cpVoiceToken = nullptr;
+		hr = cpEnum->Next(1, &cpVoiceToken, NULL);
+		// Anything but S_OK means no token was fetched (S_FALSE passes SUCCEEDED()).
+		if (hr != S_OK || !cpVoiceToken)
+			break;
+		_voice->SetVoice(cpVoiceToken);
+		if(SUCCEEDED(_voice->Speak(L"hi, this is test", SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, 0)))
+			createVoice(cpVoiceToken);
+		else
+			cpVoiceToken->Release();
+	}
+	_voice->SetVolume(_ttsState->_volume);
+	// cpEnum is still null when SpEnumTokens() failed.
+	if (cpEnum)
+		cpEnum->Release();
+
+	if(_ttsState->_availaibleVoices.size() == 0) {
+		_speechState = NO_VOICE;
+		warning("No voice is availaible");
+	} else if (_speechState == NO_VOICE)
+		_speechState = READY;
+}
+
+bool WindowsTextToSpeechManager::popState() {
+	// With no saved state underneath the current one there is nothing to restore.
+	Common::TTSState *restored = _ttsState->_next;
+	if (restored == nullptr)
+		return true;
+
+	// Release the voice tokens held by the state that is being discarded.
+	Common::TTSVoice *voice = _ttsState->_availaibleVoices.begin();
+	while (voice < _ttsState->_availaibleVoices.end()) {
+		((ISpObjectToken *) voice->getData())->Release();
+		voice++;
+	}
+
+	// Drop the current state and make the saved one current again.
+	delete _ttsState;
+	_ttsState = restored;
+
+	// Re-apply the restored settings through the regular setters.
+	setLanguage(_ttsState->_language);
+	setPitch(_ttsState->_pitch);
+	setVolume(_ttsState->_volume);
+	setRate(_ttsState->_rate);
+	setVoice(_ttsState->_activeVoice);
+	return false;
 }

 #endif
--- a/backends/text-to-speech/windows/windows-text-to-speech.h
+++ b/backends/text-to-speech/windows/windows-text-to-speech.h
@ -36,7 +36,8 @@ public:
 		READY,
 		PAUSED,
 		SPEAKING,
-		BROKEN
+		BROKEN,
+		NO_VOICE
 	};

 	WindowsTextToSpeechManager();
@ -63,10 +64,14 @@ public:

 	virtual void setLanguage(Common::String language);

+	virtual bool popState();
+
 private:
 	void init();
 	virtual void updateVoices();
-	void createVoice(int typeNumber, Common::TTSVoice::Gender, char *description);
+	void createVoice(void *cpVoiceToken);
+	void freeVoices();
+	Common::String lcidToLocale(Common::String lcid);
 	SpeechState _speechState;
 };

--- a/common/text-to-speech.cpp
+++ b/common/text-to-speech.cpp
@ -59,25 +59,5 @@ void TextToSpeechManager::pushState() {
 	updateVoices();
 }

-bool TextToSpeechManager::popState() {
-	if (_ttsState->_next == nullptr)
-		return true;
-
-	for (TTSVoice *i = _ttsState->_availaibleVoices.begin(); i < _ttsState->_availaibleVoices.end(); i++) {
-		free(i->_data);
-	}
-
-	TTSState *oldState = _ttsState;
-	_ttsState = _ttsState->_next;
-
-	delete oldState;
-
-	setLanguage(_ttsState->_language);
-	setPitch(_ttsState->_pitch);
-	setVolume(_ttsState->_volume);
-	setRate(_ttsState->_rate);
-	return false;
-}
-
 }
 #endif
--- a/common/text-to-speech.h
+++ b/common/text-to-speech.h
@ -109,7 +109,7 @@ public:
 	Array<TTSVoice> getVoicesArray() { return _ttsState->_availaibleVoices; }

 	void pushState();
-	bool popState();
+	// Default implementation: restores nothing and returns true, the same value the
+	// Linux and Windows overrides in this change return when there is no state to pop.
+	virtual bool popState() { return true; }

 protected:
 	TTSState *_ttsState;
--- a/gui/options.cpp
+++ b/gui/options.cpp
@ -2161,6 +2161,7 @@ void GlobalOptionsDialog::apply() {
 			guiLang.setChar('\0', 2);
 			ttsMan->setLanguage(guiLang);
 		}
+		_ttsVoiceSelectionPopUp->setSelectedTag(0);
 	}
 	int volume = (ConfMan.getInt("speech_volume", "scummvm") * 100) / 256;
 	if (ConfMan.hasKey("mute", "scummvm") && ConfMan.getBool("mute", "scummvm"))
--- a/gui/widgets/popup.h
+++ b/gui/widgets/popup.h
@ -77,7 +77,7 @@ public:
 	uint32 getSelectedTag() const				{ return (_selectedItem >= 0) ? _entries[_selectedItem].tag : (uint32)-1; }
 //	const String& getSelectedString() const		{ return (_selectedItem >= 0) ? _entries[_selectedItem].name : String::emptyString; }

-	void handleMouseEntered(int button)	{ read(_entries[_selectedItem].name); setFlags(WIDGET_HILITED); markAsDirty(); }
+	// Reads the selected entry aloud only when there is a valid selection; uses the same ">= 0" test as getSelectedTag().
+	void handleMouseEntered(int button)	{ if (_selectedItem >= 0) read(_entries[_selectedItem].name); setFlags(WIDGET_HILITED); markAsDirty(); }
 	void handleMouseLeft(int button)	{ clearFlags(WIDGET_HILITED); markAsDirty(); }

 	virtual void reflowLayout();