TTS: Supertonic

Generate speech with the Supertonic 3 model. Supertonic supports 31 languages and provides sync, async, and real-time playback modes.

Source files

Sync: test_tts_non_streaming_supertonic_en.js
Async: test_tts_non_streaming_supertonic_en_async.js
Play async: test_tts_non_streaming_supertonic_en_play_async.js

Synchronous generation

// Copyright (c)  2026  Xiaomi Corporation
//
// Synchronous text-to-speech with the Supertonic model.
//
// Usage:
//   node offline_tts_sync.js
//
const sherpa_onnx = require('sherpa-onnx-node');

// Build a blocking (synchronously constructed) OfflineTts instance that is
// configured for the Supertonic model.
//
// Returns the new sherpa_onnx.OfflineTts object.
function createOfflineTts() {
  // Replace this directory with the actual location of your model files.
  const modelDir = './sherpa-onnx-supertonic-3-tts-int8-2026-05-11';

  // The seven files that make up a Supertonic model.
  const supertonic = {
    durationPredictor: `${modelDir}/duration_predictor.int8.onnx`,
    textEncoder: `${modelDir}/text_encoder.int8.onnx`,
    vectorEstimator: `${modelDir}/vector_estimator.int8.onnx`,
    vocoder: `${modelDir}/vocoder.int8.onnx`,
    ttsJson: `${modelDir}/tts.json`,
    unicodeIndexer: `${modelDir}/unicode_indexer.bin`,
    voiceStyle: `${modelDir}/voice.bin`,
  };

  const model = {
    supertonic,
    debug: true,
    numThreads: 2,
    provider: 'cpu',
  };

  return new sherpa_onnx.OfflineTts({model, maxNumSentences: 1});
}

const tts = createOfflineTts();

const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

// Per-request synthesis settings: which speaker to use, how fast to speak,
// how many diffusion steps to run, and the language of the text.
// `extra.lang` takes an ISO 639-1 code.
const generationConfig = new sherpa_onnx.GenerationConfig({
  sid: 6,               // speaker ID, valid range [0, 9]
  speed: 1.25,          // speech speed, 1.0 is normal
  numSteps: 8,          // number of diffusion steps
  extra: {lang: 'en'},  // language code
});

// Time the blocking generate() call so the real-time factor can be reported.
const startedAt = Date.now();
const audio = tts.generate({text, generationConfig});
const elapsedSeconds = (Date.now() - startedAt) / 1000;

const {samples, sampleRate} = audio;
const duration = samples.length / sampleRate;
const realTimeFactor = elapsedSeconds / duration;

console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsedSeconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsedSeconds.toFixed(3)}/${duration.toFixed(3)} =`,
    realTimeFactor.toFixed(3));

// Persist the generated samples as a wave file.
const filename = 'test-supertonic-en.wav';
sherpa_onnx.writeWave(filename, {samples, sampleRate});

console.log(`Saved to ${filename}`);

Asynchronous generation

// Copyright (c)  2026  Xiaomi Corporation
//
// Asynchronous text-to-speech with the Supertonic model.
// Uses createAsync() and generateAsync() for non-blocking generation
// with a progress callback.
//
// Usage:
//   node offline_tts_async.js
//
const sherpa_onnx = require('sherpa-onnx-node');

// Asynchronously build an OfflineTts instance configured for the Supertonic
// model.
//
// Returns a Promise that resolves to the OfflineTts instance produced by
// OfflineTts.createAsync().
async function createOfflineTts() {
  // Directory that holds the extracted Supertonic model files.
  const modelDir = './sherpa-onnx-supertonic-3-tts-int8-2026-05-11';
  const inModelDir = (fileName) => `${modelDir}/${fileName}`;

  const config = {
    model: {
      supertonic: {
        durationPredictor: inModelDir('duration_predictor.int8.onnx'),
        textEncoder: inModelDir('text_encoder.int8.onnx'),
        vectorEstimator: inModelDir('vector_estimator.int8.onnx'),
        vocoder: inModelDir('vocoder.int8.onnx'),
        ttsJson: inModelDir('tts.json'),
        unicodeIndexer: inModelDir('unicode_indexer.bin'),
        voiceStyle: inModelDir('voice.bin'),
      },
      debug: false,
      numThreads: 2,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  };

  // createAsync() hands back a Promise; wait for the instance before
  // returning it.
  const tts = await sherpa_onnx.OfflineTts.createAsync(config);
  return tts;
}

// Generate speech for `text` with the given OfflineTts instance without
// blocking, printing progress to stdout while chunks are produced.
//
// Returns a Promise that resolves to whatever tts.generateAsync() yields
// (the generated audio).
async function generateAudioAsync(tts, text) {
  const generationConfig = new sherpa_onnx.GenerationConfig({
    sid: 6,
    speed: 1.25,
    numSteps: 8,
    extra: {lang: 'en'},
  });

  // Called with {samples, progress} after each chunk is generated.
  // A truthy return value (e.g. 1) keeps generation going; a falsy value
  // cancels it.
  const reportProgress = ({samples, progress}) => {
    const percent = (progress * 100).toFixed(1);
    // '\r' keeps rewriting the same terminal line.
    process.stdout.write(
        `Progress: ${percent}%, Samples: ${samples.length}\r`);
    return 1;  // continue generation
  };

  console.log('Starting generation...');

  const request = {
    text,
    enableExternalBuffer: true,
    generationConfig,
    onProgress: reportProgress,
  };
  const audio = await tts.generateAsync(request);

  console.log('\nGeneration complete!');
  return audio;
}

// Script entry point: create the TTS engine, synthesize the sample text,
// report timing figures, and save the result as a wave file.
async function main() {
  console.log('Creating OfflineTts...');
  const tts = await createOfflineTts();
  console.log('OfflineTts created!');

  const text =
      'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

  // Time only the generation step, not engine creation.
  const startedAt = Date.now();
  const audio = await generateAudioAsync(tts, text);
  const elapsedSeconds = (Date.now() - startedAt) / 1000;

  const {samples, sampleRate} = audio;
  const duration = samples.length / sampleRate;
  // Real-time factor: processing time divided by audio duration.
  const realTimeFactor = elapsedSeconds / duration;

  console.log('Wave duration', duration.toFixed(3), 'seconds');
  console.log('Elapsed', elapsedSeconds.toFixed(3), 'seconds');
  console.log(
      `RTF = ${elapsedSeconds.toFixed(3)}/${duration.toFixed(3)} =`,
      realTimeFactor.toFixed(3));

  const filename = 'test-supertonic-en-async.wav';
  sherpa_onnx.writeWave(filename, {samples, sampleRate});
  console.log(`Saved to ${filename}`);
}

// Run the script. Any failure is reported on stderr and reflected in the
// process exit status so that shells and CI jobs can detect it.
main().catch((err) => {
  console.error('Error:', err);
  // Without this the process would still exit with status 0 after an error.
  process.exitCode = 1;
});

Asynchronous generation with real-time playback

// Copyright (c)  2026  Xiaomi Corporation
//
// Asynchronous text-to-speech with real-time playback using the speaker
// npm package. Audio chunks are played as they are generated.
//
// Usage:
//   npm install speaker
//   node offline_tts_play_async.js
//
const Speaker = require('speaker');
const sherpa_onnx = require('sherpa-onnx-node');

// Asynchronously create the OfflineTts engine used for real-time playback,
// configured with the Supertonic model files.
//
// Returns a Promise that resolves to the instance produced by
// OfflineTts.createAsync().
async function createOfflineTts() {
  // Directory that holds the extracted Supertonic model files.
  const modelDir = './sherpa-onnx-supertonic-3-tts-int8-2026-05-11';

  const supertonic = {
    durationPredictor: `${modelDir}/duration_predictor.int8.onnx`,
    textEncoder: `${modelDir}/text_encoder.int8.onnx`,
    vectorEstimator: `${modelDir}/vector_estimator.int8.onnx`,
    vocoder: `${modelDir}/vocoder.int8.onnx`,
    ttsJson: `${modelDir}/tts.json`,
    unicodeIndexer: `${modelDir}/unicode_indexer.bin`,
    voiceStyle: `${modelDir}/voice.bin`,
  };

  const model = {
    supertonic,
    debug: false,
    numThreads: 2,
    provider: 'cpu',
  };

  const tts =
      await sherpa_onnx.OfflineTts.createAsync({model, maxNumSentences: 1});
  return tts;
}

// Create a Speaker that accepts mono, signed 16-bit PCM at `sampleRate`.
function createSpeaker(sampleRate) {
  const pcmFormat = {
    channels: 1,
    bitDepth: 16,
    sampleRate,
    signed: true,
  };
  return new Speaker(pcmFormat);
}

// Pack floating-point samples into a Buffer of little-endian signed 16-bit
// PCM for the speaker. Values outside [-1.0, 1.0] are clamped first.
function float32ToInt16Buffer(samples) {
  const BYTES_PER_SAMPLE = 2;
  const count = samples.length;
  const pcm = Buffer.alloc(count * BYTES_PER_SAMPLE);

  for (let index = 0, offset = 0; index < count;
       index += 1, offset += BYTES_PER_SAMPLE) {
    const clamped = Math.min(1, Math.max(-1, samples[index]));
    // Negative values scale by 0x8000 and non-negative values by 0x7fff so
    // that -1.0 maps to -32768 and +1.0 maps to 32767.
    const scale = clamped < 0 ? 0x8000 : 0x7fff;
    pcm.writeInt16LE(Math.round(clamped * scale), offset);
  }

  return pcm;
}

// Return a Promise that settles the first time `emitter` emits either
// `eventName` (resolves with the event's first argument) or 'error'
// (rejects with the emitted error).
//
// Whichever listener fires also detaches its counterpart, so no stale
// listener is left on the emitter after the Promise has settled.
function waitForEvent(emitter, eventName) {
  return new Promise((resolve, reject) => {
    const onEvent = (value) => {
      emitter.removeListener('error', onError);
      resolve(value);
    };
    const onError = (err) => {
      emitter.removeListener(eventName, onEvent);
      reject(err);
    };

    // Registration order matters when eventName is 'error': the resolving
    // listener is registered first and therefore wins.
    emitter.once(eventName, onEvent);
    emitter.once('error', onError);
  });
}

// Generate speech for `text` and play it while it is being generated.
//
// A speaker is created at the engine's sample rate (tts.sampleRate). Every
// chunk delivered to the onProgress callback is converted to 16-bit PCM and
// written to the speaker. After generation the speaker is ended and the
// function waits for its 'close' event.
//
// Returns a Promise resolving to:
//   audio                     - the result of tts.generateAsync()
//   generationElapsedSeconds  - seconds from start until generation finished
//   playbackElapsedSeconds    - seconds from start until the speaker closed
//
// If generation fails, the speaker is still ended before the error is
// propagated, so the audio device is not left open.
async function generateAudioAsync(tts, text) {
  const generationConfig = new sherpa_onnx.GenerationConfig({
    sid: 6,
    speed: 1.25,
    numSteps: 8,
    extra: {lang: 'en'},
  });

  const speaker = createSpeaker(tts.sampleRate);
  const start = Date.now();

  console.log('Starting generation and playback...');

  let audio;
  let generationStop;
  try {
    // Each onProgress callback receives a chunk of generated audio.
    // We convert it to Int16 and write it to the speaker for immediate
    // playback.
    audio = await tts.generateAsync({
      text,
      enableExternalBuffer: true,
      generationConfig,
      onProgress: ({samples, progress}) => {
        process.stdout.write(
            `Progress: ${(progress * 100).toFixed(1)}%, ` +
            `Chunk samples: ${samples.length}\r`);
        speaker.write(float32ToInt16Buffer(samples));
        return 1;  // continue generation
      },
    });
  } finally {
    // Runs on success and on failure: no more chunks will arrive, so the
    // speaker must be ended either way.
    generationStop = Date.now();
    speaker.end();
  }

  await waitForEvent(speaker, 'close');
  const playbackStop = Date.now();

  console.log('\nGeneration and playback complete!');
  return {
    audio,
    generationElapsedSeconds: (generationStop - start) / 1000,
    playbackElapsedSeconds: (playbackStop - start) / 1000,
  };
}

// Script entry point: create the TTS engine, synthesize and play the sample
// text, report timing figures, and save the audio as a wave file.
async function main() {
  console.log('Creating OfflineTts...');
  const tts = await createOfflineTts();
  console.log('OfflineTts created!');

  const text =
      'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

  const result = await generateAudioAsync(tts, text);
  const generationSeconds = result.generationElapsedSeconds;
  const playbackSeconds = result.playbackElapsedSeconds;
  const {samples, sampleRate} = result.audio;

  const duration = samples.length / sampleRate;
  // Real-time factor is based on generation time only, not playback time.
  const realTimeFactor = generationSeconds / duration;

  console.log('Wave duration', duration.toFixed(3), 'seconds');
  console.log('Generation elapsed', generationSeconds.toFixed(3), 'seconds');
  console.log('Playback drained in', playbackSeconds.toFixed(3), 'seconds');
  console.log(
      `RTF = ${generationSeconds.toFixed(3)}/${duration.toFixed(3)} =`,
      realTimeFactor.toFixed(3));

  const filename = 'test-supertonic-en-play-async.wav';
  sherpa_onnx.writeWave(filename, {samples, sampleRate});
  console.log(`Saved to ${filename}`);
}

// Run the script. Any failure is reported on stderr and reflected in the
// process exit status so that shells and CI jobs can detect it.
main().catch((err) => {
  console.error('Error:', err);
  // Without this the process would still exit with status 0 after an error.
  process.exitCode = 1;
});

How to run

Install the packages:

npm install sherpa-onnx-node
npm install speaker  # only needed for play_async

Download the model:

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-supertonic-3-tts-int8-2026-05-11.tar.bz2
tar xf sherpa-onnx-supertonic-3-tts-int8-2026-05-11.tar.bz2
rm sherpa-onnx-supertonic-3-tts-int8-2026-05-11.tar.bz2

Set the library path and run:

# macOS
export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH

# Linux
export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH

# Choose one:
node offline_tts_sync.js
node offline_tts_async.js
node offline_tts_play_async.js

Notes

The config key is supertonic with 7 model files: durationPredictor, textEncoder, vectorEstimator, vocoder, ttsJson, unicodeIndexer, voiceStyle.
GenerationConfig fields for Supertonic:
- sid: Speaker ID (range 0-9).
- speed: Speech speed (1.0 = normal).
- numSteps: Number of diffusion steps (e.g., 8).
- extra.lang: ISO 639-1 language code. Supported: ar, bg, cs, da, de, el, en, es, et, fi, fr, hi, hr, hu, id, it, ja, ko, lt, lv, nl, pl, pt, ro, ru, sk, sl, sv, tr, uk, vi.
The async API uses OfflineTts.createAsync() and tts.generateAsync() with an onProgress callback.
The play_async mode pipes audio chunks to the speaker npm package for immediate playback during generation.