44#ifndef SHERPA_ONNX_C_API_CXX_API_H_
45#define SHERPA_ONNX_C_API_CXX_API_H_
49#include <unordered_map>
289template <
typename Derived,
typename T>
307 if (&other ==
this) {
313 p_ = other.Release();
319 const T *
Get()
const {
return p_; }
334 static_cast<Derived *
>(
this)->Destroy(p_);
340 const T *p_ =
nullptr;
344 :
public MoveOnly<OnlineStream, SherpaOnnxOnlineStream> {
357 void SetOption(
const char *key,
const char *value)
const;
385 :
public MoveOnly<OnlineRecognizer, SherpaOnnxOnlineRecognizer> {
455 std::string
task =
"transcribe";
774 :
public MoveOnly<OfflineStream, SherpaOnnxOfflineStream> {
784 void SetOption(
const char *key,
const char *value)
const;
801 :
public MoveOnly<OfflineRecognizer, SherpaOnnxOfflineRecognizer> {
1033 std::unordered_map<std::string, std::string>
extra;
1064 int32_t num_samples,
float progress,
1091 :
public MoveOnly<OfflineTts, SherpaOnnxOfflineTts> {
1114 void *arg =
nullptr)
const;
1120 void *arg =
nullptr)
const;
1124 const std::string &text, int32_t sid = 0,
float speed = 1.0,
1177 :
public MoveOnly<KeywordSpotter, SherpaOnnxKeywordSpotter> {
1258 :
public MoveOnly<OfflineSpeechDenoiser, SherpaOnnxOfflineSpeechDenoiser> {
1285 :
public MoveOnly<OnlineSpeechDenoiser, SherpaOnnxOnlineSpeechDenoiser> {
1381 :
public MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer> {
1390 void Push(
const float *p, int32_t n)
const;
1393 std::vector<float>
Get(int32_t start_index, int32_t n)
const;
1418 :
public MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector> {
1422 float buffer_size_in_seconds);
1460 :
public MoveOnly<LinearResampler, SherpaOnnxLinearResampler> {
1466 int32_t samp_rate_out_hz,
1467 float filter_cutoff_hz, int32_t num_zeros);
1476 std::vector<float>
Resample(
const float *input, int32_t input_dim,
1520 :
public MoveOnly<OfflinePunctuation, SherpaOnnxOfflinePunctuation> {
1560 :
public MoveOnly<OnlinePunctuation, SherpaOnnxOnlinePunctuation> {
1625 :
public MoveOnly<AudioTagging, SherpaOnnxAudioTagging> {
1646 int32_t top_k = -1);
1703 std::vector<SourceSeparationStem>
stems;
1710 :
public MoveOnly<OfflineSourceSeparation,
1711 SherpaOnnxOfflineSourceSeparation> {
1730 int32_t num_channels, int32_t num_samples,
1731 int32_t sample_rate)
const;
Public C API for sherpa-onnx.
struct SherpaOnnxOfflineSpeechDenoiser SherpaOnnxOfflineSpeechDenoiser
Opaque offline speech denoiser handle.
struct SherpaOnnxOfflineStream SherpaOnnxOfflineStream
Non-streaming decoding state for one utterance.
struct SherpaOnnxOnlineSpeechDenoiser SherpaOnnxOnlineSpeechDenoiser
Opaque online speech denoiser handle.
struct SherpaOnnxOnlineStream SherpaOnnxOnlineStream
Streaming decoding state for one utterance or stream.
struct SherpaOnnxOfflineRecognizer SherpaOnnxOfflineRecognizer
Non-streaming recognizer handle.
struct SherpaOnnxKeywordSpotter SherpaOnnxKeywordSpotter
Opaque keyword spotter handle.
struct SherpaOnnxOnlinePunctuation SherpaOnnxOnlinePunctuation
Opaque online punctuation handle.
struct SherpaOnnxOnlineRecognizer SherpaOnnxOnlineRecognizer
Streaming recognizer handle.
struct SherpaOnnxCircularBuffer SherpaOnnxCircularBuffer
Opaque circular-buffer handle used by helper APIs.
struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts
Opaque offline TTS handle.
struct SherpaOnnxLinearResampler SherpaOnnxLinearResampler
Opaque linear resampler handle.
struct SherpaOnnxOfflineSourceSeparation SherpaOnnxOfflineSourceSeparation
Opaque source-separation engine handle.
struct SherpaOnnxAudioTagging SherpaOnnxAudioTagging
Opaque audio tagger handle.
struct SherpaOnnxOfflinePunctuation SherpaOnnxOfflinePunctuation
Opaque offline punctuation handle.
struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector
Opaque voice activity detector handle.
RAII wrapper for audio tagging.
std::vector< AudioEvent > Compute(const OfflineStream *s, int32_t top_k=-1)
Run audio tagging and return copied results.
OfflineStream CreateStream() const
Create an offline stream for tagging.
std::shared_ptr< std::vector< AudioEvent > > ComputePtr(const OfflineStream *s, int32_t top_k=-1)
Like Compute(), but returns the result vector in a shared pointer.
void Destroy(const SherpaOnnxAudioTagging *p) const
Destroy the wrapped C handle.
static AudioTagging Create(const AudioTaggingConfig &config)
Create an audio tagger.
RAII wrapper for the circular buffer helper used by VAD.
static CircularBuffer Create(int32_t capacity)
Create a circular buffer with the given capacity in samples.
int32_t Head() const
Return the current head index.
void Destroy(const SherpaOnnxCircularBuffer *p) const
Destroy the wrapped C handle.
std::vector< float > Get(int32_t start_index, int32_t n) const
Copy a contiguous span from the buffer.
void Push(const float *p, int32_t n) const
Append samples to the buffer.
int32_t Size() const
Return the number of stored samples.
void Reset() const
Reset the buffer to empty.
void Pop(int32_t n) const
Remove samples from the head of the buffer.
RAII wrapper for keyword spotting.
OnlineStream CreateStream(const std::string &keywords) const
Create a keyword stream with extra or replacement keywords passed inline as a string.
bool IsReady(const OnlineStream *s) const
Check whether the stream has enough data to decode.
void Decode(const OnlineStream *ss, int32_t n) const
Decode multiple ready streams in parallel.
void Destroy(const SherpaOnnxKeywordSpotter *p) const
Destroy the wrapped C handle.
OnlineStream CreateStream() const
Create a keyword stream using configured keywords.
void Decode(const OnlineStream *s) const
Decode one ready stream.
static KeywordSpotter Create(const KeywordSpotterConfig &config)
Create a keyword spotter from a config struct.
void Reset(const OnlineStream *s) const
Reset a stream after a keyword trigger.
KeywordResult GetResult(const OnlineStream *s) const
Return the copied keyword spotting result for a stream.
RAII wrapper for linear resampling.
static LinearResampler Create(int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz, int32_t num_zeros)
Create a linear resampler.
LinearResampler()=default
Construct an empty wrapper.
int32_t GetInputSamplingRate() const
Return the input sample rate in Hz.
void Destroy(const SherpaOnnxLinearResampler *p) const
Destroy the wrapped C handle.
std::vector< float > Resample(const float *input, int32_t input_dim, bool flush) const
Resample one chunk of input audio.
int32_t GetOutputSamplingRate() const
Return the output sample rate in Hz.
void Reset() const
Reset the resampler state.
Base class for move-only RAII wrappers around C handles.
MoveOnly(const T *p)
Construct a wrapper from a raw C handle.
const T * Release()
Release ownership of the wrapped raw pointer.
MoveOnly(const MoveOnly &)=delete
~MoveOnly()
Destroy the wrapped handle if present.
MoveOnly()=default
Construct an empty wrapper.
MoveOnly & operator=(const MoveOnly &)=delete
MoveOnly & operator=(MoveOnly &&other)
MoveOnly(MoveOnly &&other)
const T * Get() const
Return the wrapped raw pointer without transferring ownership.
RAII wrapper for offline punctuation restoration.
static OfflinePunctuation Create(const OfflinePunctuationConfig &config)
Create an offline punctuation model.
std::string AddPunctuation(const std::string &text) const
Add punctuation to a complete input text.
void Destroy(const SherpaOnnxOfflinePunctuation *p) const
Destroy the wrapped C handle.
RAII wrapper for an offline recognizer.
std::shared_ptr< OfflineRecognizerResult > GetResultPtr(const OfflineStream *s) const
Convenience wrapper that returns the result inside a shared pointer.
static OfflineRecognizer Create(const OfflineRecognizerConfig &config)
Create an offline recognizer from a config struct.
void Decode(const OfflineStream *ss, int32_t n) const
Decode multiple offline streams in parallel.
void Decode(const OfflineStream *s) const
Decode one offline stream.
OfflineRecognizerResult GetResult(const OfflineStream *s) const
Return the copied recognition result for one stream.
void SetConfig(const OfflineRecognizerConfig &config) const
Update recognizer runtime configuration after creation.
OfflineStream CreateStream() const
Create a stream using the recognizer's configured hotwords.
void Destroy(const SherpaOnnxOfflineRecognizer *p) const
Destroy the wrapped C handle.
OfflineStream CreateStream(const std::string &hotwords) const
Create a stream with hotwords passed inline as a string.
RAII wrapper for offline source separation.
void Destroy(const SherpaOnnxOfflineSourceSeparation *p) const
Destroy the wrapped C handle.
int32_t GetNumberOfStems() const
Return the number of stems produced.
int32_t GetOutputSampleRate() const
Return the output sample rate.
static OfflineSourceSeparation Create(const OfflineSourceSeparationConfig &config)
Create an offline source separation engine.
SourceSeparationOutput Process(const float *const *samples, int32_t num_channels, int32_t num_samples, int32_t sample_rate) const
Run source separation on multi-channel audio.
RAII wrapper for offline speech denoising.
DenoisedAudio Run(const float *samples, int32_t n, int32_t sample_rate) const
Run denoising on a complete waveform.
int32_t GetSampleRate() const
Return the expected input sample rate.
static OfflineSpeechDenoiser Create(const OfflineSpeechDenoiserConfig &config)
Create an offline speech denoiser.
void Destroy(const SherpaOnnxOfflineSpeechDenoiser *p) const
Destroy the wrapped C handle.
RAII wrapper for an offline decoding stream.
void SetOption(const char *key, const char *value) const
Set a per-stream string option.
const char * GetOption(const char *key) const
Get a per-stream string option.
void Destroy(const SherpaOnnxOfflineStream *p) const
Destroy the wrapped C handle.
void AcceptWaveform(int32_t sample_rate, const float *samples, int32_t n) const
Provide the complete waveform for offline decoding.
int32_t HasOption(const char *key) const
Check whether a per-stream option exists.
OfflineStream(const SherpaOnnxOfflineStream *p)
Wrap an existing C offline stream handle.
RAII wrapper for offline TTS.
GeneratedAudio Generate(const std::string &text, int32_t sid=0, float speed=1.0, OfflineTtsCallback callback=nullptr, void *arg=nullptr) const
Generate speech using the basic interface that takes a speaker ID and a speed value.
GeneratedAudio Generate(const std::string &text, const GenerationConfig &config, OfflineTtsCallback callback=nullptr, void *arg=nullptr) const
Generate speech using the advanced generation configuration.
int32_t NumSpeakers() const
Return the number of supported speakers.
static OfflineTts Create(const OfflineTtsConfig &config)
Create an offline TTS engine.
std::shared_ptr< GeneratedAudio > Generate2(const std::string &text, const GenerationConfig &config, OfflineTtsCallback callback=nullptr, void *arg=nullptr) const
Like the advanced Generate() overload, but returns a shared pointer.
std::shared_ptr< GeneratedAudio > Generate2(const std::string &text, int32_t sid=0, float speed=1.0, OfflineTtsCallback callback=nullptr, void *arg=nullptr) const
Like Generate(), but returns a shared pointer to the result.
void Destroy(const SherpaOnnxOfflineTts *p) const
Destroy the wrapped C handle.
int32_t SampleRate() const
Return the output sample rate of generated audio.
RAII wrapper for online punctuation restoration.
void Destroy(const SherpaOnnxOnlinePunctuation *p) const
Destroy the wrapped C handle.
static OnlinePunctuation Create(const OnlinePunctuationConfig &config)
Create an online punctuation model.
std::string AddPunctuation(const std::string &text) const
Add punctuation to one input text chunk.
RAII wrapper for a streaming recognizer.
void Decode(const OnlineStream *s) const
Decode one ready stream.
OnlineRecognizerResult GetResult(const OnlineStream *s) const
Return the current recognition result for a stream.
static OnlineRecognizer Create(const OnlineRecognizerConfig &config)
Create a streaming recognizer from a config struct.
void Reset(const OnlineStream *s) const
Reset a stream after endpointing or utterance completion.
OnlineStream CreateStream(const std::string &hotwords) const
Create a stream with hotwords passed inline as a string.
void Destroy(const SherpaOnnxOnlineRecognizer *p) const
Destroy the wrapped C handle.
bool IsReady(const OnlineStream *s) const
Check whether the given stream has enough data to decode.
bool IsEndpoint(const OnlineStream *s) const
Check whether endpointing has triggered for a stream.
OnlineStream CreateStream() const
Create a stream that uses the recognizer's configured hotwords.
void Decode(const OnlineStream *ss, int32_t n) const
Decode multiple ready streams in parallel.
RAII wrapper for online speech denoising.
int32_t GetFrameShiftInSamples() const
Return the recommended frame shift in samples for streaming input.
DenoisedAudio Flush() const
Flush buffered audio and reset the denoiser.
DenoisedAudio Run(const float *samples, int32_t n, int32_t sample_rate) const
Process one chunk of streaming audio.
int32_t GetSampleRate() const
Return the expected input sample rate.
void Destroy(const SherpaOnnxOnlineSpeechDenoiser *p) const
Destroy the wrapped C handle.
void Reset() const
Reset the denoiser for a new stream.
static OnlineSpeechDenoiser Create(const OnlineSpeechDenoiserConfig &config)
Create an online speech denoiser.
void InputFinished() const
Indicate that no more input audio will be provided.
void AcceptWaveform(int32_t sample_rate, const float *samples, int32_t n) const
Append audio samples to the stream.
void Destroy(const SherpaOnnxOnlineStream *p) const
Destroy the wrapped C handle.
int32_t HasOption(const char *key) const
Check whether a per-stream option exists.
OnlineStream(const SherpaOnnxOnlineStream *p)
Wrap an existing C online stream handle.
void SetOption(const char *key, const char *value) const
Set a per-stream string option.
const char * GetOption(const char *key) const
Get a per-stream string option.
RAII wrapper for voice activity detection.
void Clear() const
Remove all queued speech segments.
void Pop() const
Remove the front queued speech segment.
void Destroy(const SherpaOnnxVoiceActivityDetector *p) const
Destroy the wrapped C handle.
std::shared_ptr< SpeechSegment > FrontPtr() const
Like Front(), but returns the segment in a shared pointer.
static VoiceActivityDetector Create(const VadModelConfig &config, float buffer_size_in_seconds)
Create a VAD instance.
void AcceptWaveform(const float *samples, int32_t n) const
Feed more audio samples to the detector.
bool IsEmpty() const
Check whether no speech segments are currently queued.
bool IsDetected() const
Check whether speech is currently detected.
void Flush() const
Flush buffered context at end of input.
SpeechSegment Front() const
Return the front queued speech segment.
void Reset() const
Reset the detector state.
std::string GetGitSha1()
Return the build Git SHA1 as a C++ string.
int32_t(*)(const float *samples, int32_t num_samples, float progress, void *arg) OfflineTtsCallback
TTS progress callback.
std::string GetVersionStr()
Return the sherpa-onnx version string as a C++ string.
Wave ReadWave(const std::string &filename)
Read a mono WAVE file into a C++ value object.
std::string GetGitDate()
Return the build Git date as a C++ string.
bool FileExists(const std::string &filename)
Return true if a file exists.
bool WriteWave(const std::string &filename, const Wave &wave)
Write a mono WAVE file from a C++ value object.
One audio-tagging event returned by the C++ wrapper.
Configuration for audio tagging.
AudioTaggingModelConfig model
Audio-tagging model configuration.
OfflineZipformerAudioTaggingModelConfig zipformer
Denoised waveform returned by speech enhancement wrappers.
std::vector< float > samples
Feature extraction settings shared by ASR and KWS wrappers.
Generated audio returned by the C++ TTS wrapper.
std::vector< float > samples
Generation-time options for advanced TTS synthesis.
std::unordered_map< std::string, std::string > extra
std::string reference_text
int32_t reference_sample_rate
std::vector< float > reference_audio
Homophone replacement resources used by some Chinese ASR setups.
Current keyword spotting result copied into C++ containers.
std::vector< float > timestamps
std::vector< std::string > tokens
Configuration for the C++ keyword spotting wrapper.
OnlineModelConfig model_config
std::string keywords_file
FeatureConfig feat_config
int32_t num_trailing_blanks
Offline Canary model configuration.
Offline Cohere Transcribe model configuration.
Offline Dolphin model file.
Offline FireRed ASR CTC model file.
Offline FireRed ASR model files.
Offline FunASR Nano model configuration.
std::string system_prompt
std::string encoder_adaptor
Optional language-model rescoring configuration for offline ASR.
Offline MedASR CTC model file.
Acoustic model configuration for offline ASR.
OfflineMedAsrCtcModelConfig medasr
OfflineTdnnModelConfig tdnn
OfflineParaformerModelConfig paraformer
OfflineSenseVoiceModelConfig sense_voice
OfflineZipformerCtcModelConfig zipformer_ctc
std::string modeling_unit
OfflineFireRedAsrCtcModelConfig fire_red_asr_ctc
OfflineMoonshineModelConfig moonshine
OfflineCanaryModelConfig canary
OfflineWhisperModelConfig whisper
OfflineCohereTranscribeModelConfig cohere_transcribe
OfflineDolphinModelConfig dolphin
std::string telespeech_ctc
OfflineWenetCtcModelConfig wenet_ctc
OfflineFunASRNanoModelConfig funasr_nano
OfflineFireRedAsrModelConfig fire_red_asr
OfflineTransducerModelConfig transducer
OfflineNemoEncDecCtcModelConfig nemo_ctc
OfflineQwen3ASRModelConfig qwen3_asr
OfflineOmnilingualAsrCtcModelConfig omnilingual
Offline Moonshine model configuration.
std::string cached_decoder
std::string uncached_decoder
std::string merged_decoder
Offline NeMo EncDec CTC model file.
Offline omnilingual ASR CTC model file.
Configuration for offline punctuation.
OfflinePunctuationModelConfig model
Offline punctuation model configuration.
std::string ct_transformer
Offline Qwen3-ASR model configuration.
std::string conv_frontend
Configuration for offline ASR.
OfflineLMConfig lm_config
FeatureConfig feat_config
OfflineModelConfig model_config
HomophoneReplacerConfig hr
std::string decoding_method
std::string hotwords_file
Offline ASR result copied into C++ containers.
std::vector< float > durations
std::vector< float > timestamps
std::vector< std::string > tokens
Offline SenseVoice model configuration.
Configuration for offline source separation.
OfflineSourceSeparationModelConfig model
Source-separation model configuration.
OfflineSourceSeparationUvrModelConfig uvr
OfflineSourceSeparationSpleeterModelConfig spleeter
Spleeter source-separation model configuration.
std::string accompaniment
UVR (MDX-Net) source-separation model configuration.
Configuration for offline speech denoising.
OfflineSpeechDenoiserModelConfig model
DPDFNet speech denoiser model configuration.
GTCRN speech denoiser model configuration.
Speech denoiser model configuration.
OfflineSpeechDenoiserGtcrnModelConfig gtcrn
OfflineSpeechDenoiserDpdfNetModelConfig dpdfnet
Offline transducer model files.
Configuration for offline TTS.
OfflineTtsModelConfig model
int32_t max_num_sentences
Kitten model configuration.
Kokoro model configuration.
Matcha model configuration.
std::string acoustic_model
Model configuration for offline TTS.
OfflineTtsKittenModelConfig kitten
OfflineTtsSupertonicModelConfig supertonic
OfflineTtsVitsModelConfig vits
OfflineTtsKokoroModelConfig kokoro
OfflineTtsMatchaModelConfig matcha
OfflineTtsPocketModelConfig pocket
OfflineTtsZipvoiceModelConfig zipvoice
Pocket TTS model configuration.
std::string token_scores_json
std::string text_conditioner
int32_t voice_embedding_cache_capacity
Supertonic model configuration.
std::string vector_estimator
std::string duration_predictor
std::string unicode_indexer
VITS model configuration.
ZipVoice model configuration.
Offline WeNet CTC model file.
Offline Whisper model configuration.
bool enable_segment_timestamps
bool enable_token_timestamps
Decoder graph configuration for online CTC + FST decoding.
Acoustic model configuration for streaming ASR.
OnlineNemoCtcModelConfig nemo_ctc
OnlineZipformer2CtcModelConfig zipformer2_ctc
std::string modeling_unit
OnlineToneCtcModelConfig t_one_ctc
OnlineParaformerModelConfig paraformer
OnlineTransducerModelConfig transducer
Streaming NeMo CTC model file.
Configuration for online punctuation.
OnlinePunctuationModelConfig model
Online punctuation model configuration.
Configuration for streaming ASR.
std::string hotwords_file
OnlineModelConfig model_config
OnlineCtcFstDecoderConfig ctc_fst_decoder_config
FeatureConfig feat_config
float rule3_min_utterance_length
std::string decoding_method
HomophoneReplacerConfig hr
float rule1_min_trailing_silence
float rule2_min_trailing_silence
Current streaming ASR result copied into C++ containers.
std::vector< std::string > tokens
std::vector< float > timestamps
Configuration for online speech denoising.
OfflineSpeechDenoiserModelConfig model
Streaming T-One CTC model file.
Streaming transducer model files.
Silero VAD model configuration.
float min_silence_duration
float max_speech_duration
float min_speech_duration
Output of a source-separation run.
std::vector< SourceSeparationStem > stems
A single stem (output track) with one or more channels.
std::vector< std::vector< float > > samples
One speech segment produced by the VAD wrapper.
std::vector< float > samples
Ten VAD model configuration.
float min_silence_duration
float min_speech_duration
float max_speech_duration
SileroVadModelConfig silero_vad
TenVadModelConfig ten_vad
Mono PCM waveform used by the helper I/O functions.
std::vector< float > samples