sherpa-onnx C API 1.0
Public C API and C++ wrapper for sherpa-onnx
Loading...
Searching...
No Matches
cxx-api.h
Go to the documentation of this file.
1// sherpa-onnx/c-api/cxx-api.h
2//
3// Copyright (c) 2024 Xiaomi Corporation
44#ifndef SHERPA_ONNX_C_API_CXX_API_H_
45#define SHERPA_ONNX_C_API_CXX_API_H_
46
47#include <functional>
48#include <memory>
49#include <string>
50#include <unordered_map>
51#include <vector>
52
54
56
57// ============================================================================
58// Streaming ASR
59// ============================================================================
63 std::string encoder;
65 std::string decoder;
67 std::string joiner;
68};
69
73 std::string encoder;
75 std::string decoder;
76};
77
81 std::string model;
82};
83
87 std::string model;
88};
89
93 std::string model;
94};
95
150
154 int32_t sample_rate = 16000;
156 int32_t feature_dim = 80;
157};
158
162 std::string graph;
164 int32_t max_active = 3000;
165};
166
170 std::string dict_dir;
172 std::string lexicon;
174 std::string rule_fsts;
175};
176
244
248 std::string text;
250 std::vector<std::string> tokens;
252 std::vector<float> timestamps;
254 std::string json;
255};
256
258struct Wave {
260 std::vector<float> samples;
262 int32_t sample_rate = 0;
263};
264
273SHERPA_ONNX_API Wave ReadWave(const std::string &filename);
274
282SHERPA_ONNX_API bool WriteWave(const std::string &filename, const Wave &wave);
283
/// Base class for move-only owning wrappers around a raw `const T *` handle.
///
/// Uses CRTP: `Derived` must provide a member callable as
/// `Destroy(const T *)`. It is invoked exactly once for every non-null handle
/// this object still owns when it is destroyed or move-assigned over.
///
/// Copying is disabled. Moving transfers the handle and leaves the source
/// empty. Both move operations are `noexcept` so that standard containers
/// (e.g. `std::vector` on reallocation) can move wrappers instead of
/// rejecting or copying them.
///
/// @tparam Derived Concrete wrapper type that derives from this class.
/// @tparam T       Type of the wrapped handle.
template <typename Derived, typename T>
class MoveOnly {
 public:
  /// Construct an empty wrapper that owns nothing.
  MoveOnly() = default;

  /// Take ownership of `p`. A null `p` yields an empty wrapper.
  explicit MoveOnly(const T *p) : p_(p) {}

  /// Release the owned handle, if any, through `Derived::Destroy`.
  ~MoveOnly() { Destroy(); }

  MoveOnly(const MoveOnly &) = delete;

  MoveOnly &operator=(const MoveOnly &) = delete;

  /// Steal the handle from `other`; `other` becomes empty.
  MoveOnly(MoveOnly &&other) noexcept : p_(other.Release()) {}

  /// Destroy the currently owned handle (if any), then steal the one held by
  /// `other`. Self-assignment is a no-op.
  MoveOnly &operator=(MoveOnly &&other) noexcept {
    if (this != &other) {
      Destroy();
      p_ = other.Release();
    }

    return *this;
  }

  /// @return The wrapped pointer; ownership stays with this object.
  const T *Get() const noexcept { return p_; }

  /// Give up ownership of the wrapped pointer.
  ///
  /// @return The previously wrapped pointer (possibly null). The caller is
  ///         now responsible for it; this object becomes empty.
  const T *Release() noexcept {
    const T *released = p_;
    p_ = nullptr;
    return released;
  }

 private:
  /// Hand a non-null handle to `Derived::Destroy(const T *)` and clear it.
  ///
  /// NOTE(review): this is also reached from ~MoveOnly(), i.e. after the
  /// Derived part of the object has already been destroyed, so
  /// `Derived::Destroy` should not depend on data members of Derived.
  void Destroy() {
    if (p_ == nullptr) {
      return;
    }

    static_cast<Derived *>(this)->Destroy(p_);

    p_ = nullptr;
  }

 protected:
  /// The owned handle, or nullptr when the wrapper is empty.
  const T *p_ = nullptr;
};
343
345 : public MoveOnly<OnlineStream, SherpaOnnxOnlineStream> {
346 public:
349
351 void AcceptWaveform(int32_t sample_rate, const float *samples,
352 int32_t n) const;
353
355 void InputFinished() const;
356
358 void SetOption(const char *key, const char *value) const;
360 const char *GetOption(const char *key) const;
362 int32_t HasOption(const char *key) const;
363
365 void Destroy(const SherpaOnnxOnlineStream *p) const;
366};
367
386 : public MoveOnly<OnlineRecognizer, SherpaOnnxOnlineRecognizer> {
387 public:
390
393
396
398 OnlineStream CreateStream(const std::string &hotwords) const;
399
401 bool IsReady(const OnlineStream *s) const;
402
404 void Decode(const OnlineStream *s) const;
405
407 void Decode(const OnlineStream *ss, int32_t n) const;
408
411
413 void Reset(const OnlineStream *s) const;
414
416 bool IsEndpoint(const OnlineStream *s) const;
417
418 private:
420};
421
422// ============================================================================
423// Non-streaming ASR
424// ============================================================================
428 std::string encoder;
430 std::string decoder;
432 std::string joiner;
433};
434
438 std::string model;
439};
440
444 std::string model;
445};
446
450 std::string encoder;
452 std::string decoder;
454 std::string language;
456 std::string task = "transcribe";
458 int32_t tail_paddings = -1;
463};
464
468 std::string encoder;
470 std::string decoder;
472 std::string src_lang;
474 std::string tgt_lang;
476 bool use_pnc = true;
477};
478
482 std::string encoder;
484 std::string decoder;
486 std::string language;
488 bool use_punct = true;
490 bool use_itn = true;
491};
492
496 std::string encoder;
498 std::string decoder;
499};
500
504 std::string model;
505};
506
510 std::string model;
511};
512
516 std::string model;
518 std::string language;
520 bool use_itn = false;
521};
522
526 std::string model;
527};
528
532 std::string model;
533};
534
538 std::string model;
539};
540
546
550 std::string model;
551};
552
556 std::string preprocessor;
558 std::string encoder;
560 std::string uncached_decoder;
562 std::string cached_decoder;
564 std::string merged_decoder;
565};
566
570 std::string encoder_adaptor;
572 std::string llm;
574 std::string embedding;
576 std::string tokenizer;
578 std::string system_prompt = "You are a helpful assistant.";
580 std::string user_prompt = "语音转写:";
582 int32_t max_new_tokens = 512;
584 float temperature = 1e-6f;
586 float top_p = 0.8f;
588 int32_t seed = 42;
590 std::string language;
592 bool itn = true;
594 std::string hotwords;
595};
596
600 std::string conv_frontend;
602 std::string encoder;
604 std::string decoder;
606 std::string tokenizer;
609 std::string hotwords;
611 int32_t max_total_len = 512;
613 int32_t max_new_tokens = 128;
615 float temperature = 1e-6f;
617 float top_p = 0.8f;
619 int32_t seed = 42;
620};
621
683
687 std::string model;
689 float scale = 1.0;
690};
691
751
755 std::string text;
757 std::vector<float> timestamps;
759 std::vector<std::string> tokens;
761 std::string json;
763 std::string lang;
765 std::string emotion;
767 std::string event;
768
770 std::vector<float> durations;
771};
772
775 : public MoveOnly<OfflineStream, SherpaOnnxOfflineStream> {
776 public:
779
781 void AcceptWaveform(int32_t sample_rate, const float *samples,
782 int32_t n) const;
783
785 void SetOption(const char *key, const char *value) const;
787 const char *GetOption(const char *key) const;
789 int32_t HasOption(const char *key) const;
790
792 void Destroy(const SherpaOnnxOfflineStream *p) const;
793};
794
802 : public MoveOnly<OfflineRecognizer, SherpaOnnxOfflineRecognizer> {
803 public:
806
809
812
814 OfflineStream CreateStream(const std::string &hotwords) const;
815
817 void Decode(const OfflineStream *s) const;
818
820 void Decode(const OfflineStream *ss, int32_t n) const;
821
824
831 std::shared_ptr<OfflineRecognizerResult> GetResultPtr(
832 const OfflineStream *s) const;
833
835 void SetConfig(const OfflineRecognizerConfig &config) const;
836
837 private:
839};
840
841// ============================================================================
842// Non-streaming TTS
843// ============================================================================
847 std::string model;
849 std::string lexicon;
851 std::string tokens;
853 std::string data_dir;
855 std::string dict_dir;
856
858 float noise_scale = 0.667;
860 float noise_scale_w = 0.8;
862 float length_scale = 1.0;
863};
864
868 std::string acoustic_model;
870 std::string vocoder;
872 std::string lexicon;
874 std::string tokens;
876 std::string data_dir;
878 std::string dict_dir;
879
881 float noise_scale = 0.667;
883 float length_scale = 1.0;
884};
885
889 std::string model;
891 std::string voices;
893 std::string tokens;
895 std::string data_dir;
897 std::string dict_dir;
899 std::string lexicon;
901 std::string lang;
902
904 float length_scale = 1.0;
905};
906
910 std::string model;
912 std::string voices;
914 std::string tokens;
916 std::string data_dir;
917
919 float length_scale = 1.0;
920};
921
925 std::string tokens;
927 std::string encoder;
929 std::string decoder;
931 std::string vocoder;
933 std::string data_dir;
935 std::string lexicon;
936
938 float feat_scale = 0.1;
940 float t_shift = 0.5;
942 float target_rms = 0.1;
944 float guidance_scale = 1.0;
945};
946
950 std::string lm_flow;
952 std::string lm_main;
954 std::string encoder;
956 std::string decoder;
958 std::string text_conditioner;
959
961 std::string vocab_json;
963 std::string token_scores_json;
966};
967
973 std::string text_encoder;
975 std::string vector_estimator;
977 std::string vocoder;
979 std::string tts_json;
981 std::string unicode_indexer;
983 std::string voice_style;
984};
985
1015
1019 float silence_scale = 0.2;
1021 float speed = 1.0;
1023 int32_t sid = 0;
1025 std::vector<float> reference_audio;
1029 std::string reference_text;
1031 int32_t num_steps = 5;
1032
1034 std::unordered_map<std::string, std::string> extra;
1035};
1036
1050
1054 std::vector<float> samples;
1056 int32_t sample_rate = 0;
1057};
1058
1064using OfflineTtsCallback = int32_t (*)(const float *samples,
1065 int32_t num_samples, float progress,
1066 void *arg);
1067
1092 : public MoveOnly<OfflineTts, SherpaOnnxOfflineTts> {
1093 public:
1095 static OfflineTts Create(const OfflineTtsConfig &config);
1096
1098 void Destroy(const SherpaOnnxOfflineTts *p) const;
1099
1101 int32_t SampleRate() const;
1102
1104 int32_t NumSpeakers() const;
1105
1112 GeneratedAudio Generate(const std::string &text, int32_t sid = 0,
1113 float speed = 1.0,
1114 OfflineTtsCallback callback = nullptr,
1115 void *arg = nullptr) const;
1116
1118 GeneratedAudio Generate(const std::string &text,
1119 const GenerationConfig &config,
1120 OfflineTtsCallback callback = nullptr,
1121 void *arg = nullptr) const;
1122
1124 std::shared_ptr<GeneratedAudio> Generate2(
1125 const std::string &text, int32_t sid = 0, float speed = 1.0,
1126 OfflineTtsCallback callback = nullptr, void *arg = nullptr) const;
1127
1130 std::shared_ptr<GeneratedAudio> Generate2(
1131 const std::string &text, const GenerationConfig &config,
1132 OfflineTtsCallback callback = nullptr, void *arg = nullptr) const;
1133
1134 private:
1135 explicit OfflineTts(const SherpaOnnxOfflineTts *p);
1136};
1137
1138// ============================================================
1139// For Keyword Spotter
1140// ============================================================
1141
1145 std::string keyword;
1147 std::vector<std::string> tokens;
1149 std::vector<float> timestamps;
1151 float start_time = 0.0f;
1153 std::string json;
1154};
1155
1175
1178 : public MoveOnly<KeywordSpotter, SherpaOnnxKeywordSpotter> {
1179 public:
1182
1184 void Destroy(const SherpaOnnxKeywordSpotter *p) const;
1185
1188
1191 OnlineStream CreateStream(const std::string &keywords) const;
1192
1194 bool IsReady(const OnlineStream *s) const;
1195
1197 void Decode(const OnlineStream *s) const;
1198
1200 void Decode(const OnlineStream *ss, int32_t n) const;
1201
1203 void Reset(const OnlineStream *s) const;
1204
1207
1208 private:
1209 explicit KeywordSpotter(const SherpaOnnxKeywordSpotter *p);
1210};
1211
1217
1223
1242
1248
1252 std::vector<float> samples;
1254 int32_t sample_rate = 0;
1255};
1256
1259 : public MoveOnly<OfflineSpeechDenoiser, SherpaOnnxOfflineSpeechDenoiser> {
1260 public:
1263 const OfflineSpeechDenoiserConfig &config);
1264
1267
1269 DenoisedAudio Run(const float *samples, int32_t n, int32_t sample_rate) const;
1270
1272 int32_t GetSampleRate() const;
1273
1274 private:
1276};
1277
1283
1286 : public MoveOnly<OnlineSpeechDenoiser, SherpaOnnxOnlineSpeechDenoiser> {
1287 public:
1290
1293
1295 DenoisedAudio Run(const float *samples, int32_t n, int32_t sample_rate) const;
1296
1299
1301 void Reset() const;
1302
1304 int32_t GetSampleRate() const;
1305
1308 int32_t GetFrameShiftInSamples() const;
1309
1310 private:
1312};
1313
1314// ==============================
1315// VAD
1316// ==============================
1317
1321 std::string model;
1323 float threshold = 0.5;
1329 int32_t window_size = 512;
1332};
1333
1337 std::string model;
1339 float threshold = 0.5;
1345 int32_t window_size = 256;
1348};
1349
1361
1363 int32_t sample_rate = 16000;
1365 int32_t num_threads = 1;
1367 std::string provider = "cpu";
1369 bool debug = false;
1370};
1371
1375 int32_t start = 0;
1377 std::vector<float> samples;
1378};
1379
1382 : public MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer> {
1383 public:
1385 static CircularBuffer Create(int32_t capacity);
1386
1388 void Destroy(const SherpaOnnxCircularBuffer *p) const;
1389
1391 void Push(const float *p, int32_t n) const;
1392
1394 std::vector<float> Get(int32_t start_index, int32_t n) const;
1395
1397 void Pop(int32_t n) const;
1398
1400 int32_t Size() const;
1401
1403 int32_t Head() const;
1404
1406 void Reset() const;
1407
1408 private:
1409 explicit CircularBuffer(const SherpaOnnxCircularBuffer *p);
1410};
1411
1419 : public MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector> {
1420 public:
1423 float buffer_size_in_seconds);
1424
1427
1429 void AcceptWaveform(const float *samples, int32_t n) const;
1430
1432 bool IsEmpty() const;
1433
1435 bool IsDetected() const;
1436
1438 void Pop() const;
1439
1441 void Clear() const;
1442
1445
1447 std::shared_ptr<SpeechSegment> FrontPtr() const;
1448
1450 void Reset() const;
1451
1453 void Flush() const;
1454
1455 private:
1457};
1458
1461 : public MoveOnly<LinearResampler, SherpaOnnxLinearResampler> {
1462 public:
1464 LinearResampler() = default;
1466 static LinearResampler Create(int32_t samp_rate_in_hz,
1467 int32_t samp_rate_out_hz,
1468 float filter_cutoff_hz, int32_t num_zeros);
1469
1472
1474 void Reset() const;
1475
1477 std::vector<float> Resample(const float *input, int32_t input_dim,
1478 bool flush) const;
1479
1481 int32_t GetInputSamplingRate() const;
1483 int32_t GetOutputSamplingRate() const;
1484
1485 private:
1486 explicit LinearResampler(const SherpaOnnxLinearResampler *p);
1487};
1488
1496SHERPA_ONNX_API bool FileExists(const std::string &filename);
1497
1498// ============================================================================
1499// Offline Punctuation
1500// ============================================================================
1504 std::string ct_transformer;
1506 int32_t num_threads = 1;
1508 bool debug = false;
1510 std::string provider = "cpu";
1511};
1512
1518
1521 : public MoveOnly<OfflinePunctuation, SherpaOnnxOfflinePunctuation> {
1522 public:
1525
1528
1530 std::string AddPunctuation(const std::string &text) const;
1531
1532 private:
1534};
1535
1536// ============================================================================
1537// Online Punctuation
1538// ============================================================================
1542 std::string cnn_bilstm;
1544 std::string bpe_vocab;
1546 int32_t num_threads = 1;
1548 bool debug = false;
1550 std::string provider = "cpu";
1551};
1552
1558
1561 : public MoveOnly<OnlinePunctuation, SherpaOnnxOnlinePunctuation> {
1562 public:
1565
1568
1570 std::string AddPunctuation(const std::string &text) const;
1571
1572 private:
1574};
1575
1576// ============================================================================
1577// Audio tagging
1578// ============================================================================
1584
1603
1613
1617 std::string name;
1619 int32_t index;
1621 float prob;
1622};
1623
1626 : public MoveOnly<AudioTagging, SherpaOnnxAudioTagging> {
1627 public:
1630
1632 void Destroy(const SherpaOnnxAudioTagging *p) const;
1633
1642 std::vector<AudioEvent> Compute(const OfflineStream *s, int32_t top_k = -1);
1643
1646 std::shared_ptr<std::vector<AudioEvent>> ComputePtr(const OfflineStream *s,
1647 int32_t top_k = -1);
1648
1649 private:
1650 explicit AudioTagging(const SherpaOnnxAudioTagging *p);
1651};
1652
1653// ==============================
1654// Source Separation
1655// ==============================
1656
1664
1670
1688
1694
1698 std::vector<std::vector<float>> samples;
1699};
1700
1704 std::vector<SourceSeparationStem> stems;
1706 int32_t sample_rate = 0;
1707};
1708
1711 : public MoveOnly<OfflineSourceSeparation,
1712 SherpaOnnxOfflineSourceSeparation> {
1713 public:
1716 const OfflineSourceSeparationConfig &config);
1717
1720
1730 SourceSeparationOutput Process(const float *const *samples,
1731 int32_t num_channels, int32_t num_samples,
1732 int32_t sample_rate) const;
1733
1735 int32_t GetOutputSampleRate() const;
1736
1738 int32_t GetNumberOfStems() const;
1739
1740 private:
1742};
1743
1744// ============================================================================
1745// Spoken Language Identification
1746// ============================================================================
1747
1751 std::string encoder;
1753 std::string decoder;
1755 int32_t tail_paddings = 0;
1756};
1757
1769
1775
1778 : public MoveOnly<SpokenLanguageIdentification,
1779 SherpaOnnxSpokenLanguageIdentification> {
1780 public:
1784
1787
1790
1793
1794 private:
1797};
1798
1799// ============================================================================
1800// Speaker Embedding Extractor
1801// ============================================================================
1802
1806 std::string model;
1808 int32_t num_threads = 1;
1810 bool debug = false;
1812 std::string provider = "cpu";
1813};
1814
1817 : public MoveOnly<SpeakerEmbeddingExtractor,
1818 SherpaOnnxSpeakerEmbeddingExtractor> {
1819 public:
1822 const SpeakerEmbeddingExtractorConfig &config);
1823
1826
1828 int32_t Dim() const;
1829
1832
1834 bool IsReady(const OnlineStream *s) const;
1835
1837 std::vector<float> ComputeEmbedding(const OnlineStream *s) const;
1838
1839 private:
1842};
1843
1844// ============================================================================
1845// Speaker Embedding Manager
1846// ============================================================================
1847
1851 float score;
1853 std::string name;
1854};
1855
1858 : public MoveOnly<SpeakerEmbeddingManager,
1859 SherpaOnnxSpeakerEmbeddingManager> {
1860 public:
1862 static SpeakerEmbeddingManager Create(int32_t dim);
1863
1866
1868 bool Add(const std::string &name, const float *v) const;
1869
1871 bool AddList(const std::string &name, const float **v) const;
1872
1874 bool AddListFlattened(const std::string &name, const float *v,
1875 int32_t n) const;
1876
1878 bool Remove(const std::string &name) const;
1879
1881 std::string Search(const float *v, float threshold) const;
1882
1884 std::vector<SpeakerMatch> GetBestMatches(const float *v, float threshold,
1885 int32_t n) const;
1886
1888 bool Verify(const std::string &name, const float *v, float threshold) const;
1889
1891 bool Contains(const std::string &name) const;
1892
1894 int32_t NumSpeakers() const;
1895
1897 std::vector<std::string> GetAllSpeakers() const;
1898
1899 private:
1901};
1902
1903// ============================================================================
1904// Offline Speaker Diarization
1905// ============================================================================
1906
1912
1924
1929 int32_t num_clusters = 0;
1931 float threshold = 0.5;
1932};
1933
1947
1957
1960 std::function<void(int32_t num_processed_chunks, int32_t num_total_chunks)>;
1961
1964 : public MoveOnly<OfflineSpeakerDiarization,
1965 SherpaOnnxOfflineSpeakerDiarization> {
1966 public:
1969 const OfflineSpeakerDiarizationConfig &config);
1970
1973
1975 int32_t GetSampleRate() const;
1976
1979
1981 std::vector<OfflineSpeakerDiarizationSegment> Process(
1982 const float *samples, int32_t n) const;
1983
1985 std::vector<OfflineSpeakerDiarizationSegment> Process(
1986 const float *samples, int32_t n,
1987 const OfflineSpeakerDiarizationProgressCallback &callback) const;
1988
1989 private:
1992};
1993
1994} // namespace sherpa_onnx::cxx
1995
1996#endif // SHERPA_ONNX_C_API_CXX_API_H_
Public C API for sherpa-onnx.
struct SherpaOnnxSpokenLanguageIdentification SherpaOnnxSpokenLanguageIdentification
Opaque spoken-language identification handle.
Definition c-api.h:2979
struct SherpaOnnxOfflineSpeechDenoiser SherpaOnnxOfflineSpeechDenoiser
Opaque offline speech denoiser handle.
Definition c-api.h:4119
struct SherpaOnnxOfflineStream SherpaOnnxOfflineStream
Non-streaming decoding state for one utterance.
Definition c-api.h:1199
#define SHERPA_ONNX_API
Definition c-api.h:106
struct SherpaOnnxOnlineSpeechDenoiser SherpaOnnxOnlineSpeechDenoiser
Opaque online speech denoiser handle.
Definition c-api.h:4217
struct SherpaOnnxOnlineStream SherpaOnnxOnlineStream
Streaming decoding state for one utterance or stream.
Definition c-api.h:426
struct SherpaOnnxOfflineRecognizer SherpaOnnxOfflineRecognizer
Non-streaming recognizer handle.
Definition c-api.h:1196
struct SherpaOnnxSpeakerEmbeddingManager SherpaOnnxSpeakerEmbeddingManager
Opaque speaker embedding manager handle.
Definition c-api.h:3204
struct SherpaOnnxKeywordSpotter SherpaOnnxKeywordSpotter
Opaque keyword spotter handle.
Definition c-api.h:1707
struct SherpaOnnxOnlinePunctuation SherpaOnnxOnlinePunctuation
Opaque online punctuation handle.
Definition c-api.h:3674
struct SherpaOnnxOnlineRecognizer SherpaOnnxOnlineRecognizer
Streaming recognizer handle.
Definition c-api.h:424
struct SherpaOnnxCircularBuffer SherpaOnnxCircularBuffer
Opaque circular-buffer handle used by helper APIs.
Definition c-api.h:1963
struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts
Opaque offline TTS handle.
Definition c-api.h:2507
struct SherpaOnnxLinearResampler SherpaOnnxLinearResampler
Opaque linear resampler handle.
Definition c-api.h:3723
struct SherpaOnnxOfflineSourceSeparation SherpaOnnxOfflineSourceSeparation
Opaque source-separation engine handle.
Definition c-api.h:4331
struct SherpaOnnxOfflineSpeakerDiarization SherpaOnnxOfflineSpeakerDiarization
Opaque offline speaker diarization handle.
Definition c-api.h:3892
struct SherpaOnnxSpeakerEmbeddingExtractor SherpaOnnxSpeakerEmbeddingExtractor
Opaque speaker embedding extractor handle.
Definition c-api.h:3099
struct SherpaOnnxAudioTagging SherpaOnnxAudioTagging
Opaque audio tagger handle.
Definition c-api.h:3493
struct SherpaOnnxOfflinePunctuation SherpaOnnxOfflinePunctuation
Opaque offline punctuation handle.
Definition c-api.h:3597
struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector
Opaque voice activity detector handle.
Definition c-api.h:2082
RAII wrapper for audio tagging.
Definition cxx-api.h:1626
std::vector< AudioEvent > Compute(const OfflineStream *s, int32_t top_k=-1)
Run audio tagging and return copied results.
OfflineStream CreateStream() const
Create an offline stream for tagging.
std::shared_ptr< std::vector< AudioEvent > > ComputePtr(const OfflineStream *s, int32_t top_k=-1)
Like Compute(), but returns the result vector in a shared pointer.
void Destroy(const SherpaOnnxAudioTagging *p) const
Destroy the wrapped C handle.
static AudioTagging Create(const AudioTaggingConfig &config)
Create an audio tagger.
RAII wrapper for the circular buffer helper used by VAD.
Definition cxx-api.h:1382
static CircularBuffer Create(int32_t capacity)
Create a circular buffer with the given capacity in samples.
int32_t Head() const
Return the current head index.
void Destroy(const SherpaOnnxCircularBuffer *p) const
Destroy the wrapped C handle.
std::vector< float > Get(int32_t start_index, int32_t n) const
Copy a contiguous span from the buffer.
void Push(const float *p, int32_t n) const
Append samples to the buffer.
int32_t Size() const
Return the number of stored samples.
void Reset() const
Reset the buffer to empty.
void Pop(int32_t n) const
Remove samples from the head of the buffer.
RAII wrapper for keyword spotting.
Definition cxx-api.h:1178
OnlineStream CreateStream(const std::string &keywords) const
Create a keyword stream with inline extra or replacement keywords.
bool IsReady(const OnlineStream *s) const
Check whether the stream has enough data to decode.
void Decode(const OnlineStream *ss, int32_t n) const
Decode multiple ready streams in parallel.
void Destroy(const SherpaOnnxKeywordSpotter *p) const
Destroy the wrapped C handle.
OnlineStream CreateStream() const
Create a keyword stream using configured keywords.
void Decode(const OnlineStream *s) const
Decode one ready stream.
static KeywordSpotter Create(const KeywordSpotterConfig &config)
Create a keyword spotter from a config struct.
void Reset(const OnlineStream *s) const
Reset a stream after a keyword trigger.
KeywordResult GetResult(const OnlineStream *s) const
Return the copied keyword spotting result for a stream.
RAII wrapper for linear resampling.
Definition cxx-api.h:1461
static LinearResampler Create(int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz, int32_t num_zeros)
Create a linear resampler.
LinearResampler()=default
Construct an empty wrapper.
int32_t GetInputSamplingRate() const
Return the input sample rate in Hz.
void Destroy(const SherpaOnnxLinearResampler *p) const
Destroy the wrapped C handle.
std::vector< float > Resample(const float *input, int32_t input_dim, bool flush) const
Resample one chunk of input audio.
int32_t GetOutputSamplingRate() const
Return the output sample rate in Hz.
void Reset() const
Reset the resampler state.
Base class for move-only RAII wrappers around C handles.
Definition cxx-api.h:291
MoveOnly(const T *p)
Construct a wrapper from a raw C handle.
Definition cxx-api.h:296
const T * Release()
Release ownership of the wrapped raw pointer.
Definition cxx-api.h:323
MoveOnly(const MoveOnly &)=delete
~MoveOnly()
Destroy the wrapped handle if present.
Definition cxx-api.h:299
MoveOnly()=default
Construct an empty wrapper.
MoveOnly & operator=(const MoveOnly &)=delete
MoveOnly & operator=(MoveOnly &&other)
Definition cxx-api.h:307
MoveOnly(MoveOnly &&other)
Definition cxx-api.h:305
const T * Get() const
Return the wrapped raw pointer without transferring ownership.
Definition cxx-api.h:320
RAII wrapper for offline punctuation restoration.
Definition cxx-api.h:1521
static OfflinePunctuation Create(const OfflinePunctuationConfig &config)
Create an offline punctuation model.
std::string AddPunctuation(const std::string &text) const
Add punctuation to a complete input text.
void Destroy(const SherpaOnnxOfflinePunctuation *p) const
Destroy the wrapped C handle.
RAII wrapper for an offline recognizer.
Definition cxx-api.h:802
std::shared_ptr< OfflineRecognizerResult > GetResultPtr(const OfflineStream *s) const
Convenience wrapper that returns the result inside a shared pointer.
static OfflineRecognizer Create(const OfflineRecognizerConfig &config)
Create an offline recognizer from a config struct.
void Decode(const OfflineStream *ss, int32_t n) const
Decode multiple offline streams in parallel.
void Decode(const OfflineStream *s) const
Decode one offline stream.
OfflineRecognizerResult GetResult(const OfflineStream *s) const
Return the copied recognition result for one stream.
void SetConfig(const OfflineRecognizerConfig &config) const
Update recognizer runtime configuration after creation.
OfflineStream CreateStream() const
Create a stream using the recognizer's configured hotwords.
void Destroy(const SherpaOnnxOfflineRecognizer *p) const
Destroy the wrapped C handle.
OfflineStream CreateStream(const std::string &hotwords) const
Create a stream with inline hotwords.
RAII wrapper for offline source separation.
Definition cxx-api.h:1712
void Destroy(const SherpaOnnxOfflineSourceSeparation *p) const
Destroy the wrapped C handle.
int32_t GetNumberOfStems() const
Return the number of stems produced.
int32_t GetOutputSampleRate() const
Return the output sample rate.
static OfflineSourceSeparation Create(const OfflineSourceSeparationConfig &config)
Create an offline source separation engine.
SourceSeparationOutput Process(const float *const *samples, int32_t num_channels, int32_t num_samples, int32_t sample_rate) const
Run source separation on multi-channel audio.
RAII wrapper for offline speaker diarization.
Definition cxx-api.h:1965
void SetConfig(const OfflineSpeakerDiarizationConfig &config) const
Update clustering-related settings.
int32_t GetSampleRate() const
Return the expected input sample rate.
void Destroy(const SherpaOnnxOfflineSpeakerDiarization *p) const
Destroy the wrapped C handle.
std::vector< OfflineSpeakerDiarizationSegment > Process(const float *samples, int32_t n) const
Run offline speaker diarization.
static OfflineSpeakerDiarization Create(const OfflineSpeakerDiarizationConfig &config)
Create an offline speaker diarization pipeline.
std::vector< OfflineSpeakerDiarizationSegment > Process(const float *samples, int32_t n, const OfflineSpeakerDiarizationProgressCallback &callback) const
Run offline speaker diarization with a progress callback.
RAII wrapper for offline speech denoising.
Definition cxx-api.h:1259
DenoisedAudio Run(const float *samples, int32_t n, int32_t sample_rate) const
Run denoising on a complete waveform.
int32_t GetSampleRate() const
Return the expected input sample rate.
static OfflineSpeechDenoiser Create(const OfflineSpeechDenoiserConfig &config)
Create an offline speech denoiser.
void Destroy(const SherpaOnnxOfflineSpeechDenoiser *p) const
Destroy the wrapped C handle.
RAII wrapper for an offline decoding stream.
Definition cxx-api.h:775
void SetOption(const char *key, const char *value) const
Set a per-stream string option.
const char * GetOption(const char *key) const
Get a per-stream string option.
void Destroy(const SherpaOnnxOfflineStream *p) const
Destroy the wrapped C handle.
void AcceptWaveform(int32_t sample_rate, const float *samples, int32_t n) const
Provide the complete waveform for offline decoding.
int32_t HasOption(const char *key) const
Check whether a per-stream option exists.
OfflineStream(const SherpaOnnxOfflineStream *p)
Wrap an existing C offline stream handle.
RAII wrapper for offline TTS.
Definition cxx-api.h:1092
GeneratedAudio Generate(const std::string &text, int32_t sid=0, float speed=1.0, OfflineTtsCallback callback=nullptr, void *arg=nullptr) const
Generate speech using the simple speaker-id and speed interface.
GeneratedAudio Generate(const std::string &text, const GenerationConfig &config, OfflineTtsCallback callback=nullptr, void *arg=nullptr) const
Generate speech using the advanced generation configuration.
int32_t NumSpeakers() const
Return the number of supported speakers.
static OfflineTts Create(const OfflineTtsConfig &config)
Create an offline TTS engine.
std::shared_ptr< GeneratedAudio > Generate2(const std::string &text, const GenerationConfig &config, OfflineTtsCallback callback=nullptr, void *arg=nullptr) const
Like the advanced Generate() overload, but returns a shared pointer.
std::shared_ptr< GeneratedAudio > Generate2(const std::string &text, int32_t sid=0, float speed=1.0, OfflineTtsCallback callback=nullptr, void *arg=nullptr) const
Like Generate(), but returns a shared pointer to the result.
void Destroy(const SherpaOnnxOfflineTts *p) const
Destroy the wrapped C handle.
int32_t SampleRate() const
Return the output sample rate of generated audio.
RAII wrapper for online punctuation restoration.
Definition cxx-api.h:1561
void Destroy(const SherpaOnnxOnlinePunctuation *p) const
Destroy the wrapped C handle.
static OnlinePunctuation Create(const OnlinePunctuationConfig &config)
Create an online punctuation model.
std::string AddPunctuation(const std::string &text) const
Add punctuation to one input text chunk.
RAII wrapper for a streaming recognizer.
Definition cxx-api.h:386
void Decode(const OnlineStream *s) const
Decode one ready stream.
OnlineRecognizerResult GetResult(const OnlineStream *s) const
Return the current recognition result for a stream.
static OnlineRecognizer Create(const OnlineRecognizerConfig &config)
Create a streaming recognizer from a config struct.
void Reset(const OnlineStream *s) const
Reset a stream after endpointing or utterance completion.
OnlineStream CreateStream(const std::string &hotwords) const
Create a stream with inline hotwords.
void Destroy(const SherpaOnnxOnlineRecognizer *p) const
Destroy the wrapped C handle.
bool IsReady(const OnlineStream *s) const
Check whether the given stream has enough data to decode.
bool IsEndpoint(const OnlineStream *s) const
Check whether endpointing has triggered for a stream.
OnlineStream CreateStream() const
Create a stream that uses the recognizer's configured hotwords.
void Decode(const OnlineStream *ss, int32_t n) const
Decode multiple ready streams in parallel.
RAII wrapper for online speech denoising.
Definition cxx-api.h:1286
int32_t GetFrameShiftInSamples() const
Return the recommended frame shift in samples for streaming input.
DenoisedAudio Flush() const
Flush buffered audio and reset the denoiser.
DenoisedAudio Run(const float *samples, int32_t n, int32_t sample_rate) const
Process one chunk of streaming audio.
int32_t GetSampleRate() const
Return the expected input sample rate.
void Destroy(const SherpaOnnxOnlineSpeechDenoiser *p) const
Destroy the wrapped C handle.
void Reset() const
Reset the denoiser for a new stream.
static OnlineSpeechDenoiser Create(const OnlineSpeechDenoiserConfig &config)
Create an online speech denoiser.
void InputFinished() const
Indicate that no more input audio will be provided.
void AcceptWaveform(int32_t sample_rate, const float *samples, int32_t n) const
Append audio samples to the stream.
void Destroy(const SherpaOnnxOnlineStream *p) const
Destroy the wrapped C handle.
int32_t HasOption(const char *key) const
Check whether a per-stream option exists.
OnlineStream(const SherpaOnnxOnlineStream *p)
Wrap an existing C online stream handle.
void SetOption(const char *key, const char *value) const
Set a per-stream string option.
const char * GetOption(const char *key) const
Get a per-stream string option.
RAII wrapper for speaker embedding extraction.
Definition cxx-api.h:1818
bool IsReady(const OnlineStream *s) const
Check whether the given stream has enough audio to compute an embedding.
void Destroy(const SherpaOnnxSpeakerEmbeddingExtractor *p) const
Destroy the wrapped C handle.
OnlineStream CreateStream() const
Create a stream for embedding extraction.
int32_t Dim() const
Return the embedding dimension.
static SpeakerEmbeddingExtractor Create(const SpeakerEmbeddingExtractorConfig &config)
Create a speaker embedding extractor.
std::vector< float > ComputeEmbedding(const OnlineStream *s) const
Compute the embedding for a stream.
RAII wrapper for speaker embedding management.
Definition cxx-api.h:1859
bool Contains(const std::string &name) const
Check whether a speaker is enrolled.
bool AddListFlattened(const std::string &name, const float *v, int32_t n) const
Add multiple enrollment embeddings packed in one flat array.
std::vector< std::string > GetAllSpeakers() const
Return all enrolled speaker names.
bool Add(const std::string &name, const float *v) const
Add one enrollment embedding for a speaker.
int32_t NumSpeakers() const
Return the number of enrolled speakers.
std::vector< SpeakerMatch > GetBestMatches(const float *v, float threshold, int32_t n) const
Return up to n best matches above a similarity threshold.
bool Remove(const std::string &name) const
Remove a speaker from the manager.
bool AddList(const std::string &name, const float **v) const
Add multiple enrollment embeddings for one speaker.
void Destroy(const SherpaOnnxSpeakerEmbeddingManager *p) const
Destroy the wrapped C handle.
std::string Search(const float *v, float threshold) const
Search for the best matching enrolled speaker.
bool Verify(const std::string &name, const float *v, float threshold) const
Verify whether a query embedding matches a named speaker.
static SpeakerEmbeddingManager Create(int32_t dim)
Create a speaker embedding manager.
RAII wrapper for spoken language identification.
Definition cxx-api.h:1779
OfflineStream CreateStream() const
Create an offline stream for identification.
static SpokenLanguageIdentification Create(const SpokenLanguageIdentificationConfig &config)
Create a spoken language identifier.
SpokenLanguageIdentificationResult Compute(const OfflineStream *s) const
Run spoken language identification on a stream.
void Destroy(const SherpaOnnxSpokenLanguageIdentification *p) const
Destroy the wrapped C handle.
RAII wrapper for voice activity detection.
Definition cxx-api.h:1419
void Clear() const
Remove all queued speech segments.
void Pop() const
Remove the front queued speech segment.
void Destroy(const SherpaOnnxVoiceActivityDetector *p) const
Destroy the wrapped C handle.
std::shared_ptr< SpeechSegment > FrontPtr() const
Like Front(), but returns the segment in a shared pointer.
static VoiceActivityDetector Create(const VadModelConfig &config, float buffer_size_in_seconds)
Create a VAD instance.
void AcceptWaveform(const float *samples, int32_t n) const
Feed more audio samples to the detector.
bool IsEmpty() const
Check whether no speech segments are currently queued.
bool IsDetected() const
Check whether speech is currently detected.
void Flush() const
Flush buffered context at end of input.
SpeechSegment Front() const
Return the front queued speech segment.
void Reset() const
Reset the detector state.
std::string GetGitSha1()
Return the build Git SHA1 as a C++ string.
int32_t(*)(const float *samples, int32_t num_samples, float progress, void *arg) OfflineTtsCallback
TTS progress callback.
Definition cxx-api.h:1066
std::string GetVersionStr()
Return the sherpa-onnx version string as a C++ string.
Wave ReadWave(const std::string &filename)
Read a mono WAVE file into a C++ value object.
std::string GetGitDate()
Return the build Git date as a C++ string.
std::function< void(int32_t num_processed_chunks, int32_t num_total_chunks)> OfflineSpeakerDiarizationProgressCallback
Progress callback for offline speaker diarization.
Definition cxx-api.h:1960
bool FileExists(const std::string &filename)
Return true if the file named by filename exists.
bool WriteWave(const std::string &filename, const Wave &wave)
Write a mono WAVE file from a C++ value object.
One audio-tagging event returned by the C++ wrapper.
Definition cxx-api.h:1615
Configuration for audio tagging.
Definition cxx-api.h:1605
AudioTaggingModelConfig model
Definition cxx-api.h:1607
Audio-tagging model configuration.
Definition cxx-api.h:1591
OfflineZipformerAudioTaggingModelConfig zipformer
Definition cxx-api.h:1593
Denoised waveform returned by speech enhancement wrappers.
Definition cxx-api.h:1250
std::vector< float > samples
Definition cxx-api.h:1252
Fast clustering configuration.
Definition cxx-api.h:1926
Feature extraction settings shared by ASR and KWS wrappers.
Definition cxx-api.h:152
Generated audio returned by the C++ TTS wrapper.
Definition cxx-api.h:1052
std::vector< float > samples
Definition cxx-api.h:1054
Generation-time options for advanced TTS synthesis.
Definition cxx-api.h:1017
std::unordered_map< std::string, std::string > extra
Definition cxx-api.h:1034
std::vector< float > reference_audio
Definition cxx-api.h:1025
Homophone replacement resources used by some Chinese ASR setups.
Definition cxx-api.h:168
Current keyword spotting result copied into C++ containers.
Definition cxx-api.h:1143
std::vector< float > timestamps
Definition cxx-api.h:1149
std::vector< std::string > tokens
Definition cxx-api.h:1147
Configuration for the C++ keyword spotting wrapper.
Definition cxx-api.h:1157
Offline Canary model configuration.
Definition cxx-api.h:466
Offline Cohere Transcribe model configuration.
Definition cxx-api.h:480
Offline Dolphin model file.
Definition cxx-api.h:524
Offline FireRed ASR CTC model file.
Definition cxx-api.h:502
Offline FireRed ASR model files.
Definition cxx-api.h:494
Offline FunASR Nano model configuration.
Definition cxx-api.h:568
Optional language-model rescoring configuration for offline ASR.
Definition cxx-api.h:685
Offline MedASR CTC model file.
Definition cxx-api.h:548
Acoustic model configuration for offline ASR.
Definition cxx-api.h:628
OfflineMedAsrCtcModelConfig medasr
Definition cxx-api.h:673
OfflineTdnnModelConfig tdnn
Definition cxx-api.h:638
OfflineParaformerModelConfig paraformer
Definition cxx-api.h:632
OfflineSenseVoiceModelConfig sense_voice
Definition cxx-api.h:657
OfflineZipformerCtcModelConfig zipformer_ctc
Definition cxx-api.h:665
OfflineFireRedAsrCtcModelConfig fire_red_asr_ctc
Definition cxx-api.h:677
OfflineMoonshineModelConfig moonshine
Definition cxx-api.h:659
OfflineCanaryModelConfig canary
Definition cxx-api.h:667
OfflineWhisperModelConfig whisper
Definition cxx-api.h:636
OfflineCohereTranscribeModelConfig cohere_transcribe
Definition cxx-api.h:681
OfflineDolphinModelConfig dolphin
Definition cxx-api.h:663
OfflineWenetCtcModelConfig wenet_ctc
Definition cxx-api.h:669
OfflineFunASRNanoModelConfig funasr_nano
Definition cxx-api.h:675
OfflineFireRedAsrModelConfig fire_red_asr
Definition cxx-api.h:661
OfflineTransducerModelConfig transducer
Definition cxx-api.h:630
OfflineNemoEncDecCtcModelConfig nemo_ctc
Definition cxx-api.h:634
OfflineQwen3ASRModelConfig qwen3_asr
Definition cxx-api.h:679
OfflineOmnilingualAsrCtcModelConfig omnilingual
Definition cxx-api.h:671
Offline Moonshine model configuration.
Definition cxx-api.h:554
Offline NeMo EncDec CTC model file.
Definition cxx-api.h:442
Offline omnilingual ASR CTC model file.
Definition cxx-api.h:542
Offline Paraformer model file.
Definition cxx-api.h:436
Configuration for offline punctuation.
Definition cxx-api.h:1514
OfflinePunctuationModelConfig model
Definition cxx-api.h:1516
Offline punctuation model configuration.
Definition cxx-api.h:1502
Offline Qwen3-ASR model configuration.
Definition cxx-api.h:598
Configuration for offline ASR.
Definition cxx-api.h:724
Offline ASR result copied into C++ containers.
Definition cxx-api.h:753
std::vector< std::string > tokens
Definition cxx-api.h:759
Offline SenseVoice model configuration.
Definition cxx-api.h:514
Configuration for offline source separation.
Definition cxx-api.h:1690
OfflineSourceSeparationModelConfig model
Definition cxx-api.h:1692
Source-separation model configuration.
Definition cxx-api.h:1676
OfflineSourceSeparationUvrModelConfig uvr
Definition cxx-api.h:1680
OfflineSourceSeparationSpleeterModelConfig spleeter
Definition cxx-api.h:1678
Spleeter source-separation model configuration.
Definition cxx-api.h:1658
UVR (MDX-Net) source-separation model configuration.
Definition cxx-api.h:1666
Configuration for offline speaker diarization.
Definition cxx-api.h:1935
OfflineSpeakerSegmentationModelConfig segmentation
Definition cxx-api.h:1937
SpeakerEmbeddingExtractorConfig embedding
Definition cxx-api.h:1939
Segmentation model configuration for offline speaker diarization.
Definition cxx-api.h:1914
OfflineSpeakerSegmentationPyannoteModelConfig pyannote
Definition cxx-api.h:1916
Pyannote segmentation model configuration.
Definition cxx-api.h:1908
Configuration for offline speech denoising.
Definition cxx-api.h:1244
OfflineSpeechDenoiserModelConfig model
Definition cxx-api.h:1246
DPDFNet speech denoiser model configuration.
Definition cxx-api.h:1219
GTCRN speech denoiser model configuration.
Definition cxx-api.h:1213
Speech denoiser model configuration.
Definition cxx-api.h:1230
OfflineSpeechDenoiserGtcrnModelConfig gtcrn
Definition cxx-api.h:1232
OfflineSpeechDenoiserDpdfNetModelConfig dpdfnet
Definition cxx-api.h:1234
Offline transducer model files.
Definition cxx-api.h:426
Configuration for offline TTS.
Definition cxx-api.h:1038
OfflineTtsModelConfig model
Definition cxx-api.h:1040
Model configuration for offline TTS.
Definition cxx-api.h:992
OfflineTtsKittenModelConfig kitten
Definition cxx-api.h:1000
OfflineTtsSupertonicModelConfig supertonic
Definition cxx-api.h:1006
OfflineTtsVitsModelConfig vits
Definition cxx-api.h:994
OfflineTtsKokoroModelConfig kokoro
Definition cxx-api.h:998
OfflineTtsMatchaModelConfig matcha
Definition cxx-api.h:996
OfflineTtsPocketModelConfig pocket
Definition cxx-api.h:1004
OfflineTtsZipvoiceModelConfig zipvoice
Definition cxx-api.h:1002
Pocket TTS model configuration.
Definition cxx-api.h:948
Supertonic model configuration.
Definition cxx-api.h:969
ZipVoice model configuration.
Definition cxx-api.h:923
Offline WeNet CTC model file.
Definition cxx-api.h:536
Offline Whisper model configuration.
Definition cxx-api.h:448
Zipformer audio-tagging model configuration.
Definition cxx-api.h:1580
Offline Zipformer CTC model file.
Definition cxx-api.h:530
Decoder graph configuration for online CTC + FST decoding.
Definition cxx-api.h:160
Acoustic model configuration for streaming ASR.
Definition cxx-api.h:121
OnlineNemoCtcModelConfig nemo_ctc
Definition cxx-api.h:129
OnlineZipformer2CtcModelConfig zipformer2_ctc
Definition cxx-api.h:127
OnlineToneCtcModelConfig t_one_ctc
Definition cxx-api.h:131
OnlineParaformerModelConfig paraformer
Definition cxx-api.h:125
OnlineTransducerModelConfig transducer
Definition cxx-api.h:123
Streaming NeMo CTC model file.
Definition cxx-api.h:85
Streaming Paraformer model files.
Definition cxx-api.h:71
Configuration for online punctuation.
Definition cxx-api.h:1554
OnlinePunctuationModelConfig model
Definition cxx-api.h:1556
Online punctuation model configuration.
Definition cxx-api.h:1540
Configuration for streaming ASR.
Definition cxx-api.h:200
OnlineCtcFstDecoderConfig ctc_fst_decoder_config
Definition cxx-api.h:231
Current streaming ASR result copied into C++ containers.
Definition cxx-api.h:246
std::vector< std::string > tokens
Definition cxx-api.h:250
Configuration for online speech denoising.
Definition cxx-api.h:1279
OfflineSpeechDenoiserModelConfig model
Definition cxx-api.h:1281
Streaming T-One CTC model file.
Definition cxx-api.h:91
Streaming transducer model files.
Definition cxx-api.h:61
Streaming Zipformer2 CTC model file.
Definition cxx-api.h:79
Silero VAD model configuration.
Definition cxx-api.h:1319
Output of a source-separation run.
Definition cxx-api.h:1702
std::vector< SourceSeparationStem > stems
Definition cxx-api.h:1704
A single stem (output track) with one or more channels.
Definition cxx-api.h:1696
std::vector< std::vector< float > > samples
Definition cxx-api.h:1698
Configuration for speaker embedding extraction.
Definition cxx-api.h:1804
One speaker match returned by the best-matches API.
Definition cxx-api.h:1849
One speech segment produced by the VAD wrapper.
Definition cxx-api.h:1373
std::vector< float > samples
Definition cxx-api.h:1377
Configuration for spoken language identification.
Definition cxx-api.h:1759
SpokenLanguageIdentificationWhisperConfig whisper
Definition cxx-api.h:1761
Result of spoken language identification.
Definition cxx-api.h:1771
Whisper model configuration for spoken language identification.
Definition cxx-api.h:1749
Ten VAD model configuration.
Definition cxx-api.h:1335
VAD model configuration.
Definition cxx-api.h:1356
SileroVadModelConfig silero_vad
Definition cxx-api.h:1358
Mono PCM waveform used by the helper I/O functions.
Definition cxx-api.h:258
std::vector< float > samples
Definition cxx-api.h:260