TTS: Supertonic

Generate speech with the Supertonic 3 model. Supertonic supports 31 languages and provides sync, async, and real-time playback modes.

Source files

Synchronous generation

 1// Copyright (c)  2026  Xiaomi Corporation
 2//
 3// Synchronous text-to-speech with the Supertonic model.
 4//
 5// Usage:
 6//   node offline_tts_sync.js
 7//
 8const sherpa_onnx = require('sherpa-onnx-node');
 9
// Build a synchronous OfflineTts instance configured for the Supertonic
// int8 model. Every model file is looked up inside one model directory;
// the resulting paths are relative (they start with './').
function createOfflineTts() {
  // Replace this directory with the actual location of your model files.
  const modelDir = './sherpa-onnx-supertonic-3-tts-int8-2026-05-11';

  // The seven files that make up a Supertonic model.
  const supertonic = {
    durationPredictor: `${modelDir}/duration_predictor.int8.onnx`,
    textEncoder: `${modelDir}/text_encoder.int8.onnx`,
    vectorEstimator: `${modelDir}/vector_estimator.int8.onnx`,
    vocoder: `${modelDir}/vocoder.int8.onnx`,
    ttsJson: `${modelDir}/tts.json`,
    unicodeIndexer: `${modelDir}/unicode_indexer.bin`,
    voiceStyle: `${modelDir}/voice.bin`,
  };

  const model = {
    supertonic,
    debug: true,
    numThreads: 2,
    provider: 'cpu',
  };

  return new sherpa_onnx.OfflineTts({model, maxNumSentences: 1});
}
36
const tts = createOfflineTts();

const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

// GenerationConfig carries the speaker ID, the speed, the number of
// diffusion steps, and the language. `extra.lang` is an ISO 639-1 code.
const generationConfig = new sherpa_onnx.GenerationConfig({
  sid: 6,               // speaker ID, valid range [0, 9]
  speed: 1.25,          // speech speed, 1.0 is normal
  numSteps: 8,          // number of diffusion steps
  extra: {lang: 'en'},  // language code
});

// Time the blocking generate() call with wall-clock timestamps.
const startedAt = Date.now();
const audio = tts.generate({text, generationConfig});
const finishedAt = Date.now();

const {samples, sampleRate} = audio;

// Real-time factor = processing time / duration of the produced audio.
const elapsedSeconds = (finishedAt - startedAt) / 1000;
const waveSeconds = samples.length / sampleRate;
const rtf = elapsedSeconds / waveSeconds;

const elapsedText = elapsedSeconds.toFixed(3);
const waveText = waveSeconds.toFixed(3);

console.log('Wave duration', waveText, 'seconds');
console.log('Elapsed', elapsedText, 'seconds');
console.log(`RTF = ${elapsedText}/${waveText} =`, rtf.toFixed(3));

// Save the generated samples as a wave file.
const outputPath = 'test-supertonic-en.wav';
sherpa_onnx.writeWave(outputPath, {samples, sampleRate});

console.log(`Saved to ${outputPath}`);

Asynchronous generation

  1// Copyright (c)  2026  Xiaomi Corporation
  2//
  3// Asynchronous text-to-speech with the Supertonic model.
  4// Uses createAsync() and generateAsync() for non-blocking generation
  5// with a progress callback.
  6//
  7// Usage:
  8//   node offline_tts_async.js
  9//
 10const sherpa_onnx = require('sherpa-onnx-node');
 11
 // Asynchronously build an OfflineTts instance for the Supertonic int8 model.
 // Every model file is looked up inside one model directory; the resulting
 // paths are relative (they start with './').
 async function createOfflineTts() {
   const modelDir = './sherpa-onnx-supertonic-3-tts-int8-2026-05-11';

   // The seven files that make up a Supertonic model.
   const supertonic = {
     durationPredictor: `${modelDir}/duration_predictor.int8.onnx`,
     textEncoder: `${modelDir}/text_encoder.int8.onnx`,
     vectorEstimator: `${modelDir}/vector_estimator.int8.onnx`,
     vocoder: `${modelDir}/vocoder.int8.onnx`,
     ttsJson: `${modelDir}/tts.json`,
     unicodeIndexer: `${modelDir}/unicode_indexer.bin`,
     voiceStyle: `${modelDir}/voice.bin`,
   };

   const model = {
     supertonic,
     debug: false,
     numThreads: 2,
     provider: 'cpu',
   };

   // createAsync() returns a Promise; wait for it and hand back its result.
   const tts =
       await sherpa_onnx.OfflineTts.createAsync({model, maxNumSentences: 1});
   return tts;
 }
 39
 // Generate speech for `text` with the non-blocking API, printing progress
 // to stdout on a single, continuously rewritten line.
 //
 // tts:  object exposing generateAsync()
 // text: the text to synthesize
 // Resolves to whatever tts.generateAsync() resolves to.
 async function generateAudioAsync(tts, text) {
   const generationConfig = new sherpa_onnx.GenerationConfig({
     sid: 6,
     speed: 1.25,
     numSteps: 8,
     extra: {lang: 'en'},
   });

   // Called with {samples, progress} as generation advances. Returning a
   // truthy value (here 1) continues generation; a falsy value cancels it.
   const onProgress = ({samples, progress}) => {
     const percent = (progress * 100).toFixed(1);
     // The trailing '\r' moves the cursor back so the next update
     // overwrites this one.
     process.stdout.write(
         `Progress: ${percent}%, Samples: ${samples.length}\r`);
     return 1;
   };

   console.log('Starting generation...');

   const audio = await tts.generateAsync({
     text,
     enableExternalBuffer: true,
     generationConfig,
     onProgress,
   });

   console.log('\nGeneration complete!');
   return audio;
 }
 68
 // Entry point: create the TTS engine, synthesize a fixed English sentence,
 // print timing statistics, and save the audio to a wave file.
 async function main() {
   console.log('Creating OfflineTts...');
   const tts = await createOfflineTts();
   console.log('OfflineTts created!');

   const text =
       'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

   // Wall-clock time spent inside generateAudioAsync().
   const startedAt = Date.now();
   const audio = await generateAudioAsync(tts, text);
   const finishedAt = Date.now();

   const {samples, sampleRate} = audio;

   // Real-time factor = processing time / duration of the produced audio.
   const elapsedSeconds = (finishedAt - startedAt) / 1000;
   const waveSeconds = samples.length / sampleRate;
   const rtf = elapsedSeconds / waveSeconds;

   const elapsedText = elapsedSeconds.toFixed(3);
   const waveText = waveSeconds.toFixed(3);

   console.log('Wave duration', waveText, 'seconds');
   console.log('Elapsed', elapsedText, 'seconds');
   console.log(`RTF = ${elapsedText}/${waveText} =`, rtf.toFixed(3));

   const outputPath = 'test-supertonic-en-async.wav';
   sherpa_onnx.writeWave(outputPath, {samples, sampleRate});
   console.log(`Saved to ${outputPath}`);
 }
 98
 // Run the example. On failure, report the error and make the process exit
 // with a non-zero status so shells and CI scripts can detect it.
 main().catch((err) => {
   console.error('Error:', err);
   process.exitCode = 1;
 });

Asynchronous generation with real-time playback

  1// Copyright (c)  2026  Xiaomi Corporation
  2//
  3// Asynchronous text-to-speech with real-time playback using the speaker
  4// npm package. Audio chunks are played as they are generated.
  5//
  6// Usage:
  7//   npm install speaker
  8//   node offline_tts_play_async.js
  9//
 10const Speaker = require('speaker');
 11const sherpa_onnx = require('sherpa-onnx-node');
 12
 // Asynchronously build an OfflineTts instance for the Supertonic int8 model.
 // Every model file is looked up inside one model directory; the resulting
 // paths are relative (they start with './').
 async function createOfflineTts() {
   const modelDir = './sherpa-onnx-supertonic-3-tts-int8-2026-05-11';

   // The seven files that make up a Supertonic model.
   const supertonic = {
     durationPredictor: `${modelDir}/duration_predictor.int8.onnx`,
     textEncoder: `${modelDir}/text_encoder.int8.onnx`,
     vectorEstimator: `${modelDir}/vector_estimator.int8.onnx`,
     vocoder: `${modelDir}/vocoder.int8.onnx`,
     ttsJson: `${modelDir}/tts.json`,
     unicodeIndexer: `${modelDir}/unicode_indexer.bin`,
     voiceStyle: `${modelDir}/voice.bin`,
   };

   const model = {
     supertonic,
     debug: false,
     numThreads: 2,
     provider: 'cpu',
   };

   // createAsync() is awaited, so callers receive its resolved value.
   const tts =
       await sherpa_onnx.OfflineTts.createAsync({model, maxNumSentences: 1});
   return tts;
 }
 39
 // Create a Speaker for mono, signed 16-bit PCM at the given sample rate.
 // This is the format produced by float32ToInt16Buffer().
 function createSpeaker(sampleRate) {
   const pcmFormat = {
     channels: 1,
     bitDepth: 16,
     sampleRate,
     signed: true,
   };
   return new Speaker(pcmFormat);
 }
 48
 // Turn floating-point samples into a Buffer of little-endian signed 16-bit
 // PCM for the speaker.
 //
 // Each sample is clamped to [-1.0, 1.0]. Negative values are scaled by
 // 0x8000 and non-negative values by 0x7fff, so -1.0 maps to -32768 and
 // 1.0 maps to 32767. The scaled value is rounded with Math.round().
 function float32ToInt16Buffer(samples) {
   const BYTES_PER_SAMPLE = 2;
   const count = samples.length;
   const pcm = Buffer.alloc(count * BYTES_PER_SAMPLE);

   let index = 0;
   while (index < count) {
     const clamped = Math.min(1, Math.max(-1, samples[index]));
     const scale = clamped < 0 ? 0x8000 : 0x7fff;
     pcm.writeInt16LE(Math.round(clamped * scale), index * BYTES_PER_SAMPLE);
     index += 1;
   }

   return pcm;
 }
 61
 // Return a Promise that settles on whichever happens first:
 //   - `eventName` is emitted: resolves with the event's first argument;
 //   - 'error' is emitted: rejects with the emitted error.
 //
 // Whichever listener fires also removes the other one, so no stale
 // listener is left attached to the emitter after the Promise settles.
 function waitForEvent(emitter, eventName) {
   return new Promise((resolve, reject) => {
     const onEvent = (value) => {
       emitter.removeListener('error', onError);
       resolve(value);
     };
     const onError = (err) => {
       emitter.removeListener(eventName, onEvent);
       reject(err);
     };

     emitter.once(eventName, onEvent);
     emitter.once('error', onError);
   });
 }
 68
 // Generate speech for `text` and play it while it is being generated.
 //
 // tts:  object exposing generateAsync() and sampleRate
 // text: the text to synthesize
 //
 // Resolves to:
 //   audio                    - the value tts.generateAsync() resolved to
 //   generationElapsedSeconds - seconds from start until generation finished
 //   playbackElapsedSeconds   - seconds from start until the speaker closed
 //
 // If generation fails, the speaker is ended before the error is rethrown
 // so the stream is not left open.
 async function generateAudioAsync(tts, text) {
   const generationConfig = new sherpa_onnx.GenerationConfig({
     sid: 6,
     speed: 1.25,
     numSteps: 8,
     extra: {lang: 'en'},
   });

   const speaker = createSpeaker(tts.sampleRate);
   const start = Date.now();

   console.log('Starting generation and playback...');

   // Each onProgress callback receives a chunk of generated audio.
   // We convert it to Int16 and write it to the speaker for immediate
   // playback. Returning 1 tells the generator to keep going.
   let audio;
   try {
     audio = await tts.generateAsync({
       text,
       enableExternalBuffer: true,
       generationConfig,
       onProgress: ({samples, progress}) => {
         process.stdout.write(
             `Progress: ${(progress * 100).toFixed(1)}%, ` +
             `Chunk samples: ${samples.length}\r`);
         speaker.write(float32ToInt16Buffer(samples));
         return 1;
       },
     });
   } catch (err) {
     // Do not leave the speaker stream open when generation fails.
     speaker.end();
     throw err;
   }

   const generationStop = Date.now();

   // Signal end of input, then wait for the speaker to emit 'close' before
   // taking the playback timestamp.
   speaker.end();
   await waitForEvent(speaker, 'close');
   const playbackStop = Date.now();

   console.log('\nGeneration and playback complete!');
   return {
     audio,
     generationElapsedSeconds: (generationStop - start) / 1000,
     playbackElapsedSeconds: (playbackStop - start) / 1000,
   };
 }
109
// Entry point: create the TTS engine, synthesize and play a fixed English
// sentence, print timing statistics, and save the audio to a wave file.
async function main() {
  console.log('Creating OfflineTts...');
  const tts = await createOfflineTts();
  console.log('OfflineTts created!');

  const text =
      'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

  const result = await generateAudioAsync(tts, text);
  const {audio, generationElapsedSeconds, playbackElapsedSeconds} = result;
  const {samples, sampleRate} = audio;

  // Real-time factor = generation time / duration of the produced audio.
  const waveSeconds = samples.length / sampleRate;
  const rtf = generationElapsedSeconds / waveSeconds;

  const generationText = generationElapsedSeconds.toFixed(3);
  const playbackText = playbackElapsedSeconds.toFixed(3);
  const waveText = waveSeconds.toFixed(3);

  console.log('Wave duration', waveText, 'seconds');
  console.log('Generation elapsed', generationText, 'seconds');
  console.log('Playback drained in', playbackText, 'seconds');
  console.log(`RTF = ${generationText}/${waveText} =`, rtf.toFixed(3));

  const outputPath = 'test-supertonic-en-play-async.wav';
  sherpa_onnx.writeWave(outputPath, {samples, sampleRate});
  console.log(`Saved to ${outputPath}`);
}
139
// Run the example. On failure, report the error and make the process exit
// with a non-zero status so shells and CI scripts can detect it.
main().catch((err) => {
  console.error('Error:', err);
  process.exitCode = 1;
});

How to run

  1. Install the packages:

    npm install sherpa-onnx-node
    npm install speaker  # only needed for play_async
    
  2. Download the model:

    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-supertonic-3-tts-int8-2026-05-11.tar.bz2
    tar xf sherpa-onnx-supertonic-3-tts-int8-2026-05-11.tar.bz2
    rm sherpa-onnx-supertonic-3-tts-int8-2026-05-11.tar.bz2
    
  3. Set the library path and run:

    # macOS
    export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH
    
    # Linux
    export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH
    
    # Choose one:
    node offline_tts_sync.js
    node offline_tts_async.js
    node offline_tts_play_async.js
    

Notes

  • The model config key is supertonic. It takes seven file paths: durationPredictor, textEncoder, vectorEstimator, vocoder, ttsJson, unicodeIndexer, and voiceStyle.

  • GenerationConfig fields for Supertonic:

      - sid: Speaker ID (range 0-9).
      - speed: Speech speed (1.0 = normal).
      - numSteps: Number of diffusion steps (e.g., 8).
      - extra.lang: ISO 639-1 language code. Supported: ar, bg, cs, da, de, el, en, es, et, fi, fr, hi, hr, hu, id, it, ja, ko, lt, lv, nl, pl, pt, ro, ru, sk, sl, sv, tr, uk, vi.

  • The async API uses OfflineTts.createAsync() and tts.generateAsync() with an onProgress callback.

  • The play_async mode pipes audio chunks to the speaker npm package for immediate playback during generation.