mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-30 05:35:31 +00:00
181 lines
7.0 KiB
C++
181 lines
7.0 KiB
C++
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following disclaimer
|
|
// in the documentation and/or other materials provided with the
|
|
// distribution.
|
|
// * Neither the name of Google Inc. nor the names of its
|
|
// contributors may be used to endorse or promote products derived from
|
|
// this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
// The EnergyEndpointer class finds likely speech onset and offset points.
|
|
//
|
|
// The implementation described here is about the simplest possible.
|
|
// It is based on timings of threshold crossings for overall signal
|
|
// RMS. It is suitable for light weight applications.
|
|
//
|
|
// As written, the basic idea is that one specifies intervals that
|
|
// must be occupied by super- and sub-threshold energy levels, and
|
|
// defers decisions re onset and offset times until these
|
|
// specifications have been met. Three basic intervals are tested: an
|
|
// onset window, a speech-on window, and an offset window. We require
|
|
// super-threshold to exceed some mimimum total durations in the onset
|
|
// and speech-on windows before declaring the speech onset time, and
|
|
// we specify a required sub-threshold residency in the offset window
|
|
// before declaring speech offset. As the various residency requirements are
|
|
// met, the EnergyEndpointer instance assumes various states, and can return the
|
|
// ID of these states to the client (see EpStatus below).
|
|
//
|
|
// The levels of the speech and background noise are continuously updated. It is
|
|
// important that the background noise level be estimated initially for
|
|
// robustness in noisy conditions. The first frames are assumed to be background
|
|
// noise and a fast update rate is used for the noise level. The duration for
|
|
// fast update is controlled by the fast_update_dur_ paramter.
|
|
//
|
|
// If used in noisy conditions, the endpointer should be started and run in the
|
|
// EnvironmentEstimation mode, for at least 200ms, before switching to
|
|
// UserInputMode.
|
|
// Audio feedback contamination can appear in the input audio, if not cut
|
|
// out or handled by echo cancellation. Audio feedback can trigger a false
|
|
// accept. The false accepts can be ignored by setting
|
|
// ep_contamination_rejection_period.
|
|
|
|
#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
|
|
#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
|
|
|
|
#include <vector>
|
|
|
|
#include "nsAutoPtr.h"
|
|
|
|
#include "energy_endpointer_params.h"
|
|
|
|
namespace mozilla {
|
|
|
|
// Endpointer status codes
|
|
enum EpStatus {
|
|
EP_PRE_SPEECH = 10,
|
|
EP_POSSIBLE_ONSET,
|
|
EP_SPEECH_PRESENT,
|
|
EP_POSSIBLE_OFFSET,
|
|
EP_POST_SPEECH,
|
|
};
|
|
|
|
class EnergyEndpointer {
|
|
public:
|
|
// The default construction MUST be followed by Init(), before any
|
|
// other use can be made of the instance.
|
|
EnergyEndpointer();
|
|
virtual ~EnergyEndpointer();
|
|
|
|
void Init(const EnergyEndpointerParams& params);
|
|
|
|
// Start the endpointer. This should be called at the beginning of a session.
|
|
void StartSession();
|
|
|
|
// Stop the endpointer.
|
|
void EndSession();
|
|
|
|
// Start environment estimation. Audio will be used for environment estimation
|
|
// i.e. noise level estimation.
|
|
void SetEnvironmentEstimationMode();
|
|
|
|
// Start user input. This should be called when the user indicates start of
|
|
// input, e.g. by pressing a button.
|
|
void SetUserInputMode();
|
|
|
|
// Computes the next input frame and modifies EnergyEndpointer status as
|
|
// appropriate based on the computation.
|
|
void ProcessAudioFrame(int64_t time_us,
|
|
const int16_t* samples, int num_samples,
|
|
float* rms_out);
|
|
|
|
// Returns the current state of the EnergyEndpointer and the time
|
|
// corresponding to the most recently computed frame.
|
|
EpStatus Status(int64_t* status_time_us) const;
|
|
|
|
bool estimating_environment() const {
|
|
return estimating_environment_;
|
|
}
|
|
|
|
// Returns estimated noise level in dB.
|
|
float GetNoiseLevelDb() const;
|
|
|
|
private:
|
|
class HistoryRing;
|
|
|
|
// Resets the endpointer internal state. If reset_threshold is true, the
|
|
// state will be reset completely, including adaptive thresholds and the
|
|
// removal of all history information.
|
|
void Restart(bool reset_threshold);
|
|
|
|
// Update internal speech and noise levels.
|
|
void UpdateLevels(float rms);
|
|
|
|
// Returns the number of frames (or frame number) corresponding to
|
|
// the 'time' (in seconds).
|
|
int TimeToFrame(float time) const;
|
|
|
|
EpStatus status_; // The current state of this instance.
|
|
float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH
|
|
int64_t endpointer_time_us_; // Time of the most recently received audio frame.
|
|
int64_t fast_update_frames_; // Number of frames for initial level adaptation.
|
|
int64_t frame_counter_; // Number of frames seen. Used for initial adaptation.
|
|
float max_window_dur_; // Largest search window size (seconds)
|
|
float sample_rate_; // Sampling rate.
|
|
|
|
// Ring buffers to hold the speech activity history.
|
|
nsAutoPtr<HistoryRing> history_;
|
|
|
|
// Configuration parameters.
|
|
EnergyEndpointerParams params_;
|
|
|
|
// RMS which must be exceeded to conclude frame is speech.
|
|
float decision_threshold_;
|
|
|
|
// Flag to indicate that audio should be used to estimate environment, prior
|
|
// to receiving user input.
|
|
bool estimating_environment_;
|
|
|
|
// Estimate of the background noise level. Used externally for UI feedback.
|
|
float noise_level_;
|
|
|
|
// An adaptive threshold used to update decision_threshold_ when appropriate.
|
|
float rms_adapt_;
|
|
|
|
// Start lag corresponds to the highest fundamental frequency.
|
|
int start_lag_;
|
|
|
|
// End lag corresponds to the lowest fundamental frequency.
|
|
int end_lag_;
|
|
|
|
// Time when mode switched from environment estimation to user input. This
|
|
// is used to time forced rejection of audio feedback contamination.
|
|
int64_t user_input_start_time_us_;
|
|
|
|
// prevent copy constructor and assignment
|
|
EnergyEndpointer(const EnergyEndpointer&);
|
|
void operator=(const EnergyEndpointer&);
|
|
};
|
|
|
|
} // namespace mozilla
|
|
|
|
#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
|