TTS: Supertonic
Generate speech with the Supertonic 3 model. Supertonic supports 31 languages and provides sync, async, and real-time playback modes.
Source files
Synchronous generation
1// Copyright (c) 2026 Xiaomi Corporation
2//
3// Synchronous text-to-speech with the Supertonic model.
4//
5// Usage:
6// node offline_tts_sync.js
7//
8const sherpa_onnx = require('sherpa-onnx-node');
9
// Creates an OfflineTts instance for the Supertonic model using the
// synchronous constructor.
//
// modelDir: directory that contains the seven Supertonic model files. It
//           defaults to the directory name used throughout this example;
//           pass another path if your model files live elsewhere.
//
// Returns the object produced by `new sherpa_onnx.OfflineTts(config)`.
function createOfflineTts(
    modelDir = './sherpa-onnx-supertonic-3-tts-int8-2026-05-11') {
  const config = {
    model: {
      supertonic: {
        durationPredictor: `${modelDir}/duration_predictor.int8.onnx`,
        textEncoder: `${modelDir}/text_encoder.int8.onnx`,
        vectorEstimator: `${modelDir}/vector_estimator.int8.onnx`,
        vocoder: `${modelDir}/vocoder.int8.onnx`,
        ttsJson: `${modelDir}/tts.json`,
        unicodeIndexer: `${modelDir}/unicode_indexer.bin`,
        voiceStyle: `${modelDir}/voice.bin`,
      },
      debug: true,
      numThreads: 2,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  };
  return new sherpa_onnx.OfflineTts(config);
}
36
const tts = createOfflineTts();

const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

// GenerationConfig controls speaker ID, speed, number of diffusion steps,
// and language. The `extra.lang` field uses ISO 639-1 codes.
const generationConfig = new sherpa_onnx.GenerationConfig({
  sid: 6,               // speaker ID, valid range [0, 9]
  speed: 1.25,          // speech speed, 1.0 is normal
  numSteps: 8,          // number of diffusion steps
  extra: {lang: 'en'},  // language code
});

// Only the generate() call is timed; model loading above is excluded.
const start = Date.now();
const audio = tts.generate({text, generationConfig});
const stop = Date.now();

// Real-time factor = processing time / duration of the generated audio.
const elapsedSeconds = (stop - start) / 1000;
const duration = audio.samples.length / audio.sampleRate;
const realTimeFactor = elapsedSeconds / duration;
console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsedSeconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsedSeconds.toFixed(3)}/${duration.toFixed(3)} =`,
    realTimeFactor.toFixed(3));

const filename = 'test-supertonic-en.wav';
sherpa_onnx.writeWave(
    filename, {samples: audio.samples, sampleRate: audio.sampleRate});

console.log(`Saved to ${filename}`);
Asynchronous generation
1// Copyright (c) 2026 Xiaomi Corporation
2//
3// Asynchronous text-to-speech with the Supertonic model.
4// Uses createAsync() and generateAsync() for non-blocking generation
5// with a progress callback.
6//
7// Usage:
8// node offline_tts_async.js
9//
10const sherpa_onnx = require('sherpa-onnx-node');
11
// Creates an OfflineTts instance for the Supertonic model without blocking
// while the model is loaded.
//
// modelDir: directory that contains the seven Supertonic model files. It
//           defaults to the directory name used throughout this example;
//           pass another path if your model files live elsewhere.
//
// Returns a Promise that resolves to the OfflineTts instance.
async function createOfflineTts(
    modelDir = './sherpa-onnx-supertonic-3-tts-int8-2026-05-11') {
  const config = {
    model: {
      supertonic: {
        durationPredictor: `${modelDir}/duration_predictor.int8.onnx`,
        textEncoder: `${modelDir}/text_encoder.int8.onnx`,
        vectorEstimator: `${modelDir}/vector_estimator.int8.onnx`,
        vocoder: `${modelDir}/vocoder.int8.onnx`,
        ttsJson: `${modelDir}/tts.json`,
        unicodeIndexer: `${modelDir}/unicode_indexer.bin`,
        voiceStyle: `${modelDir}/voice.bin`,
      },
      debug: false,
      numThreads: 2,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  };

  // createAsync() returns a Promise that resolves to an OfflineTts instance.
  return await sherpa_onnx.OfflineTts.createAsync(config);
}
39
// Generates speech for `text` with `tts.generateAsync()` and reports
// progress on stdout while chunks are produced.
//
// tts:  an OfflineTts instance (see createOfflineTts()).
// text: the text to synthesize.
//
// Returns a Promise that resolves to whatever generateAsync() resolves to.
async function generateAudioAsync(tts, text) {
  // Called with {samples, progress} after each chunk is generated.
  // Return a truthy value (e.g. 1) to continue, or a falsy value to cancel.
  const reportProgress = ({samples, progress}) => {
    const percent = (progress * 100).toFixed(1);
    // The trailing '\r' keeps the progress report on a single console line.
    process.stdout.write(
        `Progress: ${percent}%, ` +
        `Samples: ${samples.length}\r`);
    return 1;  // continue generation
  };

  const generationConfig = new sherpa_onnx.GenerationConfig({
    sid: 6,
    speed: 1.25,
    numSteps: 8,
    extra: {lang: 'en'},
  });

  console.log('Starting generation...');

  const request = {
    text,
    enableExternalBuffer: true,
    generationConfig,
    onProgress: reportProgress,
  };
  const generated = await tts.generateAsync(request);

  console.log('\nGeneration complete!');
  return generated;
}
68
// Entry point of the async example: creates the TTS engine, synthesizes a
// fixed sentence, prints timing statistics and saves the audio to a wave
// file.
async function main() {
  console.log('Creating OfflineTts...');
  const tts = await createOfflineTts();
  console.log('OfflineTts created!');

  const text =
      'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

  // Only the generation call is timed; engine creation is excluded.
  const startedAt = Date.now();
  const audio = await generateAudioAsync(tts, text);
  const finishedAt = Date.now();

  // Real-time factor = processing time / duration of the generated audio.
  const elapsedSeconds = (finishedAt - startedAt) / 1000;
  const duration = audio.samples.length / audio.sampleRate;
  const realTimeFactor = elapsedSeconds / duration;

  console.log('Wave duration', duration.toFixed(3), 'seconds');
  console.log('Elapsed', elapsedSeconds.toFixed(3), 'seconds');
  console.log(
      `RTF = ${elapsedSeconds.toFixed(3)}/${duration.toFixed(3)} =`,
      realTimeFactor.toFixed(3));

  const filename = 'test-supertonic-en-async.wav';
  const wave = {samples: audio.samples, sampleRate: audio.sampleRate};
  sherpa_onnx.writeWave(filename, wave);
  console.log(`Saved to ${filename}`);
}
98
// Run the example. On failure, log the error and set a non-zero exit code
// so that shells and CI can detect that the script did not succeed.
main().catch((err) => {
  console.error('Error:', err);
  process.exitCode = 1;
});
Asynchronous generation with real-time playback
1// Copyright (c) 2026 Xiaomi Corporation
2//
3// Asynchronous text-to-speech with real-time playback using the speaker
4// npm package. Audio chunks are played as they are generated.
5//
6// Usage:
7// npm install speaker
8// node offline_tts_play_async.js
9//
10const Speaker = require('speaker');
11const sherpa_onnx = require('sherpa-onnx-node');
12
// Creates an OfflineTts instance for the Supertonic model without blocking
// while the model is loaded.
//
// modelDir: directory that contains the seven Supertonic model files. It
//           defaults to the directory name used throughout this example;
//           pass another path if your model files live elsewhere.
//
// Returns a Promise that resolves to the OfflineTts instance.
async function createOfflineTts(
    modelDir = './sherpa-onnx-supertonic-3-tts-int8-2026-05-11') {
  const config = {
    model: {
      supertonic: {
        durationPredictor: `${modelDir}/duration_predictor.int8.onnx`,
        textEncoder: `${modelDir}/text_encoder.int8.onnx`,
        vectorEstimator: `${modelDir}/vector_estimator.int8.onnx`,
        vocoder: `${modelDir}/vocoder.int8.onnx`,
        ttsJson: `${modelDir}/tts.json`,
        unicodeIndexer: `${modelDir}/unicode_indexer.bin`,
        voiceStyle: `${modelDir}/voice.bin`,
      },
      debug: false,
      numThreads: 2,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  };

  return await sherpa_onnx.OfflineTts.createAsync(config);
}
39
// Builds a Speaker configured for mono, signed 16-bit audio at the given
// sample rate.
//
// sampleRate: sample rate passed through to the Speaker options.
function createSpeaker(sampleRate) {
  const options = {
    channels: 1,
    bitDepth: 16,
    sampleRate,
    signed: true,
  };
  return new Speaker(options);
}
48
// Converts floating-point samples into a Buffer of little-endian signed
// 16-bit integers for the speaker.
//
// samples: iterable of numbers; values outside [-1.0, 1.0] are clamped.
//
// Returns a Buffer holding two bytes per input sample.
function float32ToInt16Buffer(samples) {
  const BYTES_PER_SAMPLE = 2;
  const pcm = Buffer.alloc(samples.length * BYTES_PER_SAMPLE);

  let offset = 0;
  for (const sample of samples) {
    const clamped = Math.min(1, Math.max(-1, sample));
    // Negative values scale by 0x8000 and non-negative ones by 0x7fff so
    // that -1.0 maps to -32768 and 1.0 maps to 32767.
    const scale = clamped < 0 ? 0x8000 : 0x7fff;
    // writeInt16LE() returns the offset just past the bytes it wrote.
    offset = pcm.writeInt16LE(Math.round(clamped * scale), offset);
  }

  return pcm;
}
61
// Returns a Promise that resolves with the first argument of the next
// `eventName` event emitted by `emitter`, or rejects if the emitter emits
// 'error' first.
//
// Whichever listener loses the race is removed once the promise settles, so
// no stale listener stays attached to the emitter and a later 'error' event
// is not silently absorbed by an already-settled promise.
function waitForEvent(emitter, eventName) {
  return new Promise((resolve, reject) => {
    const onEvent = (value) => {
      emitter.removeListener('error', onError);
      resolve(value);
    };
    const onError = (err) => {
      emitter.removeListener(eventName, onEvent);
      reject(err);
    };

    emitter.once(eventName, onEvent);
    emitter.once('error', onError);
  });
}
68
// Generates speech for `text` and plays each chunk through a speaker as soon
// as it is produced.
//
// tts:  an OfflineTts instance; `tts.sampleRate` configures the speaker.
// text: the text to synthesize.
//
// Returns a Promise that resolves to
//   {audio, generationElapsedSeconds, playbackElapsedSeconds}
// where both durations are measured from the same start time: the first
// until generateAsync() resolved, the second until the speaker emitted
// 'close'.
//
// If generation fails, the speaker is ended before the error is rethrown so
// that it is not left open.
async function generateAudioAsync(tts, text) {
  const generationConfig = new sherpa_onnx.GenerationConfig({
    sid: 6,
    speed: 1.25,
    numSteps: 8,
    extra: {lang: 'en'},
  });

  const speaker = createSpeaker(tts.sampleRate);
  const start = Date.now();

  console.log('Starting generation and playback...');

  let audio;
  try {
    // Each onProgress callback receives a chunk of generated audio.
    // We convert it to Int16 and write it to the speaker for immediate
    // playback.
    audio = await tts.generateAsync({
      text,
      enableExternalBuffer: true,
      generationConfig,
      onProgress: ({samples, progress}) => {
        process.stdout.write(
            `Progress: ${(progress * 100).toFixed(1)}%, ` +
            `Chunk samples: ${samples.length}\r`);
        speaker.write(float32ToInt16Buffer(samples));
        return 1;
      },
    });
  } catch (err) {
    // Do not leave the speaker open when generation fails.
    speaker.end();
    throw err;
  }

  const generationStop = Date.now();

  // Start listening for 'close' before ending the stream so the event cannot
  // be missed.
  const closed = waitForEvent(speaker, 'close');
  speaker.end();
  await closed;
  const playbackStop = Date.now();

  console.log('\nGeneration and playback complete!');
  return {
    audio,
    generationElapsedSeconds: (generationStop - start) / 1000,
    playbackElapsedSeconds: (playbackStop - start) / 1000,
  };
}
109
// Entry point of the playback example: creates the TTS engine, synthesizes
// and plays a fixed sentence, prints timing statistics and saves the audio
// to a wave file.
async function main() {
  console.log('Creating OfflineTts...');
  const tts = await createOfflineTts();
  console.log('OfflineTts created!');

  const text =
      'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

  const result = await generateAudioAsync(tts, text);
  const audio = result.audio;
  const generationSeconds = result.generationElapsedSeconds;
  const playbackSeconds = result.playbackElapsedSeconds;

  // Real-time factor = generation time / duration of the generated audio.
  const duration = audio.samples.length / audio.sampleRate;
  const realTimeFactor = generationSeconds / duration;

  console.log('Wave duration', duration.toFixed(3), 'seconds');
  console.log('Generation elapsed', generationSeconds.toFixed(3), 'seconds');
  console.log('Playback drained in', playbackSeconds.toFixed(3), 'seconds');
  console.log(
      `RTF = ${generationSeconds.toFixed(3)}/${duration.toFixed(3)} =`,
      realTimeFactor.toFixed(3));

  const filename = 'test-supertonic-en-play-async.wav';
  const wave = {samples: audio.samples, sampleRate: audio.sampleRate};
  sherpa_onnx.writeWave(filename, wave);
  console.log(`Saved to ${filename}`);
}
139
// Run the example. On failure, log the error and set a non-zero exit code
// so that shells and CI can detect that the script did not succeed.
main().catch((err) => {
  console.error('Error:', err);
  process.exitCode = 1;
});
How to run
Install the packages:
npm install sherpa-onnx-node
npm install speaker  # only needed for play_async
Download the model:
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-supertonic-3-tts-int8-2026-05-11.tar.bz2
tar xf sherpa-onnx-supertonic-3-tts-int8-2026-05-11.tar.bz2
rm sherpa-onnx-supertonic-3-tts-int8-2026-05-11.tar.bz2
Set the library path and run:
# macOS
export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH

# Linux
export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH

# Choose one:
node offline_tts_sync.js
node offline_tts_async.js
node offline_tts_play_async.js
Notes
- The config key is `supertonic`, with 7 model files: `durationPredictor`, `textEncoder`, `vectorEstimator`, `vocoder`, `ttsJson`, `unicodeIndexer`, `voiceStyle`.
- `GenerationConfig` fields for Supertonic:
  - `sid`: Speaker ID (range 0-9).
  - `speed`: Speech speed (1.0 = normal).
  - `numSteps`: Number of diffusion steps (e.g., 8).
  - `extra.lang`: ISO 639-1 language code. Supported: `ar`, `bg`, `cs`, `da`, `de`, `el`, `en`, `es`, `et`, `fi`, `fr`, `hi`, `hr`, `hu`, `id`, `it`, `ja`, `ko`, `lt`, `lv`, `nl`, `pl`, `pt`, `ro`, `ru`, `sk`, `sl`, `sv`, `tr`, `uk`, `vi`.
- The async API uses `OfflineTts.createAsync()` and `tts.generateAsync()` with an `onProgress` callback.
- The play_async mode pipes audio chunks to the `speaker` npm package for immediate playback during generation.