2013-03-27 21:08:17 +00:00
|
|
|
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
|
|
|
|
//
|
|
|
|
// Redistribution and use in source and binary forms, with or without
|
|
|
|
// modification, are permitted provided that the following conditions are
|
|
|
|
// met:
|
|
|
|
//
|
|
|
|
// * Redistributions of source code must retain the above copyright
|
|
|
|
// notice, this list of conditions and the following disclaimer.
|
|
|
|
// * Redistributions in binary form must reproduce the above
|
|
|
|
// copyright notice, this list of conditions and the following disclaimer
|
|
|
|
// in the documentation and/or other materials provided with the
|
|
|
|
// distribution.
|
|
|
|
// * Neither the name of Google Inc. nor the names of its
|
|
|
|
// contributors may be used to endorse or promote products derived from
|
|
|
|
// this software without specific prior written permission.
|
|
|
|
//
|
|
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
|
|
#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
|
|
|
|
#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
|
|
|
|
|
|
|
|
#include "energy_endpointer.h"
|
|
|
|
|
|
|
|
namespace mozilla {
|
|
|
|
|
|
|
|
struct AudioChunk;
|
|
|
|
|
|
|
|
// A simple interface to the underlying energy-endpointer implementation, this
|
|
|
|
// class lets callers feed audio as it is recorded and lets them poll to find
|
|
|
|
// when the user has stopped speaking.
|
|
|
|
//
|
|
|
|
// There are two events that may trigger the end of speech:
|
|
|
|
//
|
|
|
|
// speechInputPossiblyComplete event:
|
|
|
|
//
|
|
|
|
// Signals that silence/noise has been detected for a *short* amount of
|
|
|
|
// time after some speech has been detected. It can be used for low latency
|
|
|
|
// UI feedback. To disable it, set it to a large amount.
|
|
|
|
//
|
|
|
|
// speechInputComplete event:
|
|
|
|
//
|
|
|
|
// This event is intended to signal end of input and to stop recording.
|
|
|
|
// The amount of time to wait after speech is set by
|
|
|
|
// speech_input_complete_silence_length_us_ and optionally two other
|
|
|
|
// parameters (see below).
|
|
|
|
// This time can be held constant, or can change as more speech is detected.
|
|
|
|
// In the latter case, the time changes after a set amount of time from the
|
|
|
|
// *beginning* of speech. This is motivated by the expectation that there
|
|
|
|
// will be two distinct types of inputs: short search queries and longer
|
|
|
|
// dictation style input.
|
|
|
|
//
|
|
|
|
// Three parameters are used to define the piecewise constant timeout function.
|
|
|
|
// The timeout length is speech_input_complete_silence_length until
|
|
|
|
// long_speech_length, when it changes to
|
|
|
|
// long_speech_input_complete_silence_length.
|
|
|
|
class Endpointer {
|
|
|
|
public:
|
|
|
|
explicit Endpointer(int sample_rate);
|
|
|
|
|
|
|
|
// Start the endpointer. This should be called at the beginning of a session.
|
|
|
|
void StartSession();
|
|
|
|
|
|
|
|
// Stop the endpointer.
|
|
|
|
void EndSession();
|
|
|
|
|
|
|
|
// Start environment estimation. Audio will be used for environment estimation
|
|
|
|
// i.e. noise level estimation.
|
|
|
|
void SetEnvironmentEstimationMode();
|
|
|
|
|
|
|
|
// Start user input. This should be called when the user indicates start of
|
|
|
|
// input, e.g. by pressing a button.
|
|
|
|
void SetUserInputMode();
|
|
|
|
|
|
|
|
// Process a segment of audio, which may be more than one frame.
|
|
|
|
// The status of the last frame will be returned.
|
|
|
|
EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
|
|
|
|
|
|
|
|
// Get the status of the endpointer.
|
|
|
|
EpStatus Status(int64_t *time_us);
|
|
|
|
|
2013-03-27 21:08:21 +00:00
|
|
|
// Get the expected frame size for audio chunks. Audio chunks are expected
|
|
|
|
// to contain a number of samples that is a multiple of this number, and extra
|
|
|
|
// samples will be dropped.
|
|
|
|
int32_t FrameSize() const {
|
|
|
|
return frame_size_;
|
|
|
|
}
|
|
|
|
|
2013-03-27 21:08:17 +00:00
|
|
|
// Returns true if the endpointer detected reasonable audio levels above
|
|
|
|
// background noise which could be user speech, false if not.
|
|
|
|
bool DidStartReceivingSpeech() const {
|
|
|
|
return speech_previously_detected_;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool IsEstimatingEnvironment() const {
|
|
|
|
return energy_endpointer_.estimating_environment();
|
|
|
|
}
|
|
|
|
|
|
|
|
void set_speech_input_complete_silence_length(int64_t time_us) {
|
|
|
|
speech_input_complete_silence_length_us_ = time_us;
|
|
|
|
}
|
|
|
|
|
|
|
|
void set_long_speech_input_complete_silence_length(int64_t time_us) {
|
|
|
|
long_speech_input_complete_silence_length_us_ = time_us;
|
|
|
|
}
|
|
|
|
|
|
|
|
void set_speech_input_possibly_complete_silence_length(int64_t time_us) {
|
|
|
|
speech_input_possibly_complete_silence_length_us_ = time_us;
|
|
|
|
}
|
|
|
|
|
|
|
|
void set_long_speech_length(int64_t time_us) {
|
|
|
|
long_speech_length_us_ = time_us;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool speech_input_complete() const {
|
|
|
|
return speech_input_complete_;
|
|
|
|
}
|
|
|
|
|
|
|
|
// RMS background noise level in dB.
|
|
|
|
float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
|
|
|
|
|
|
|
|
private:
|
|
|
|
// Reset internal states. Helper method common to initial input utterance
|
|
|
|
// and following input utternaces.
|
|
|
|
void Reset();
|
|
|
|
|
|
|
|
// Minimum allowable length of speech input.
|
|
|
|
int64_t speech_input_minimum_length_us_;
|
|
|
|
|
|
|
|
// The speechInputPossiblyComplete event signals that silence/noise has been
|
|
|
|
// detected for a *short* amount of time after some speech has been detected.
|
|
|
|
// This proporty specifies the time period.
|
|
|
|
int64_t speech_input_possibly_complete_silence_length_us_;
|
|
|
|
|
|
|
|
// The speechInputComplete event signals that silence/noise has been
|
|
|
|
// detected for a *long* amount of time after some speech has been detected.
|
|
|
|
// This property specifies the time period.
|
|
|
|
int64_t speech_input_complete_silence_length_us_;
|
|
|
|
|
|
|
|
// Same as above, this specifies the required silence period after speech
|
|
|
|
// detection. This period is used instead of
|
|
|
|
// speech_input_complete_silence_length_ when the utterance is longer than
|
|
|
|
// long_speech_length_. This parameter is optional.
|
|
|
|
int64_t long_speech_input_complete_silence_length_us_;
|
|
|
|
|
|
|
|
// The period of time after which the endpointer should consider
|
|
|
|
// long_speech_input_complete_silence_length_ as a valid silence period
|
|
|
|
// instead of speech_input_complete_silence_length_. This parameter is
|
|
|
|
// optional.
|
|
|
|
int64_t long_speech_length_us_;
|
|
|
|
|
|
|
|
// First speech onset time, used in determination of speech complete timeout.
|
|
|
|
int64_t speech_start_time_us_;
|
|
|
|
|
|
|
|
// Most recent end time, used in determination of speech complete timeout.
|
|
|
|
int64_t speech_end_time_us_;
|
|
|
|
|
|
|
|
int64_t audio_frame_time_us_;
|
|
|
|
EpStatus old_ep_status_;
|
|
|
|
bool waiting_for_speech_possibly_complete_timeout_;
|
|
|
|
bool waiting_for_speech_complete_timeout_;
|
|
|
|
bool speech_previously_detected_;
|
|
|
|
bool speech_input_complete_;
|
|
|
|
EnergyEndpointer energy_endpointer_;
|
|
|
|
int sample_rate_;
|
|
|
|
int32_t frame_size_;
|
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace mozilla
|
|
|
|
|
|
|
|
#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
|