include/snowboy-detect.h

// include/snowboy-detect.h

// Copyright 2016  KITT.AI (author: Guoguo Chen)

#ifndef SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_
#define SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_

#include <cstdint>
#include <memory>
#include <string>

namespace snowboy {

// Forward declarations.
struct WaveHeader;
class PipelineDetect;
class PipelineVad;

////////////////////////////////////////////////////////////////////////////////
//
// SnowboyDetect class interface.
//
////////////////////////////////////////////////////////////////////////////////
class SnowboyDetect {
 public:
  // Constructor that takes a resource file and a comma-separated list of
  // hotword models. If the provided models contain more than one hotword,
  // RunDetection() returns the index of the hotword that was triggered.
  //
  // CAVEAT: a personal model only contains one hotword, but a universal model
  //         may contain multiple hotwords. It is your responsibility to figure
  //         out the index of the hotword. For example, if your model string is
  //         "foo.pmdl,bar.umdl", where foo.pmdl contains hotword x and bar.umdl
  //         has two hotwords y and z, the indices of the different hotwords
  //         are as follows:
  //         x 1
  //         y 2
  //         z 3
  //
  // @param [in]  resource_filename   Filename of resource file.
  // @param [in]  model_str           A string of multiple hotword models,
  //                                  separated by comma.
  SnowboyDetect(const std::string& resource_filename,
                const std::string& model_str);

  // Not copyable. The object holds its state in std::unique_ptr members, so
  // copying was already implicitly disabled; the operations are deleted
  // explicitly to state the contract and to give a clear compiler diagnostic.
  SnowboyDetect(const SnowboyDetect&) = delete;
  SnowboyDetect& operator=(const SnowboyDetect&) = delete;

  // Resets the detection. This class handles voice activity detection (VAD)
  // internally. But if you have an external VAD, you should call Reset()
  // whenever you see a segment end from your VAD.
  //
  // NOTE(review): the meaning of the returned bool is not documented in this
  // header -- presumably it signals success; confirm against the
  // implementation.
  bool Reset();

  // Runs hotword detection. Supported audio format is WAVE (with linear PCM,
  // 8-bit unsigned integer, 16-bit signed integer or 32-bit signed integer).
  // See SampleRate(), NumChannels() and BitsPerSample() for the required
  // sampling rate, number of channels and bits per sample values. You are
  // supposed to provide a small chunk of data (e.g., 0.1 second) each time you
  // call RunDetection(). A larger chunk usually leads to longer delay, but
  // less CPU usage.
  //
  // Definition of return values:
  // -2: Silence.
  // -1: Error.
  //  0: No event.
  //  1: Hotword 1 triggered.
  //  2: Hotword 2 triggered.
  //  ...
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunDetection(const std::string& data, bool is_end = false);

  // Various versions of RunDetection() that take different formats of audio.
  // If NumChannels() > 1, e.g., NumChannels() == 2, then the array is laid out
  // as follows:
  //
  //   d1c1, d1c2, d2c1, d2c2, d3c1, d3c2, ..., dNc1, dNc2
  //
  // where d1c1 means data point 1 of channel 1.
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  array_length       Length of the data array.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunDetection(const float* data, int array_length, bool is_end = false);
  int RunDetection(const int16_t* data, int array_length, bool is_end = false);
  int RunDetection(const int32_t* data, int array_length, bool is_end = false);

  // Sets the sensitivity string for the loaded hotwords. A <sensitivity_str> is
  // a comma-separated list of floating point numbers between 0 and 1. For
  // example, if there are 3 loaded hotwords, your string should look something
  // like this:
  //   0.4,0.5,0.8
  // Make sure you properly align each sensitivity value with the corresponding
  // hotword.
  void SetSensitivity(const std::string& sensitivity_str);

  // Similar to the sensitivity setting above. When set higher than the above
  // sensitivity, the algorithm automatically chooses between the normal
  // sensitivity set above and the higher sensitivity set here, to maximize the
  // performance. By default, it is not set, which means the algorithm will
  // stick with the sensitivity set above.
  void SetHighSensitivity(const std::string& high_sensitivity_str);

  // Returns the sensitivity string for the current hotwords.
  std::string GetSensitivity() const;

  // Applies a fixed gain to the input audio. In case you have a very weak
  // microphone, you can use this function to boost the input audio level.
  void SetAudioGain(float audio_gain);

  // Writes the models to the model filenames specified in <model_str> in the
  // constructor. This overwrites the original model with the latest parameter
  // setting. You are supposed to call this function if you have updated the
  // hotword sensitivities through SetSensitivity(), and you would like to store
  // those values in the model as the default value.
  void UpdateModel() const;

  // Returns the number of the loaded hotwords. This helps you to figure out
  // the index of the hotwords.
  int NumHotwords() const;

  // If <apply_frontend> is true, then applies frontend audio processing;
  // otherwise turns the audio processing off. Frontend audio processing
  // includes algorithms such as automatic gain control (AGC), noise suppression
  // (NS) and so on. Generally adding frontend audio processing helps the
  // performance, but if the model is not trained with frontend audio
  // processing, it may decrease the performance. The general rule of thumb is:
  //   1. For personal models, set it to false.
  //   2. For universal models, follow the instructions of each published
  //      model.
  void ApplyFrontend(bool apply_frontend);

  // Returns the required sampling rate, number of channels and bits per sample
  // values for the audio data. You should use this information to set up your
  // audio capturing interface.
  int SampleRate() const;
  int NumChannels() const;
  int BitsPerSample() const;

  ~SnowboyDetect();

 private:
  // Both members point to types that are only forward-declared in this header.
  // Their declaration order is kept as is because it determines object layout.
  std::unique_ptr<WaveHeader> wave_header_;
  std::unique_ptr<PipelineDetect> detect_pipeline_;
};

////////////////////////////////////////////////////////////////////////////////
//
// SnowboyVad class interface.
//
////////////////////////////////////////////////////////////////////////////////
class SnowboyVad {
 public:
  // Constructor that takes a resource file. It shares the same resource file
  // with SnowboyDetect.
  //
  // NOTE(review): this single-argument constructor is intentionally left
  // non-explicit so that existing callers relying on the implicit conversion
  // from std::string keep compiling.
  //
  // @param [in]  resource_filename   Filename of resource file.
  SnowboyVad(const std::string& resource_filename);

  // Not copyable. The object holds its state in std::unique_ptr members, so
  // copying was already implicitly disabled; the operations are deleted
  // explicitly to state the contract and to give a clear compiler diagnostic.
  SnowboyVad(const SnowboyVad&) = delete;
  SnowboyVad& operator=(const SnowboyVad&) = delete;

  // Resets the VAD.
  //
  // NOTE(review): the meaning of the returned bool is not documented in this
  // header -- presumably it signals success; confirm against the
  // implementation.
  bool Reset();

  // Runs the VAD algorithm. Supported audio format is WAVE (with linear PCM,
  // 8-bit unsigned integer, 16-bit signed integer or 32-bit signed integer).
  // See SampleRate(), NumChannels() and BitsPerSample() for the required
  // sampling rate, number of channels and bits per sample values. You are
  // supposed to provide a small chunk of data (e.g., 0.1 second) each time you
  // call RunVad(). A larger chunk usually leads to longer delay, but less CPU
  // usage.
  //
  // Definition of return values:
  // -2: Silence.
  // -1: Error.
  //  0: Non-silence.
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunVad(const std::string& data, bool is_end = false);

  // Various versions of RunVad() that take different formats of audio. If
  // NumChannels() > 1, e.g., NumChannels() == 2, then the array is laid out as
  // follows:
  //
  //   d1c1, d1c2, d2c1, d2c2, d3c1, d3c2, ..., dNc1, dNc2
  //
  // where d1c1 means data point 1 of channel 1.
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  array_length       Length of the data array.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunVad(const float* data, int array_length, bool is_end = false);
  int RunVad(const int16_t* data, int array_length, bool is_end = false);
  int RunVad(const int32_t* data, int array_length, bool is_end = false);

  // Applies a fixed gain to the input audio. In case you have a very weak
  // microphone, you can use this function to boost the input audio level.
  void SetAudioGain(float audio_gain);

  // If <apply_frontend> is true, then applies frontend audio processing;
  // otherwise turns the audio processing off.
  void ApplyFrontend(bool apply_frontend);

  // Returns the required sampling rate, number of channels and bits per sample
  // values for the audio data. You should use this information to set up your
  // audio capturing interface.
  int SampleRate() const;
  int NumChannels() const;
  int BitsPerSample() const;

  ~SnowboyVad();

 private:
  // Both members point to types that are only forward-declared in this header.
  // Their declaration order is kept as is because it determines object layout.
  std::unique_ptr<WaveHeader> wave_header_;
  std::unique_ptr<PipelineVad> vad_pipeline_;
};

}  // namespace snowboy

#endif  // SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_