TTS: Pocket (Voice Cloning)

Generate speech with the Pocket TTS model using voice cloning. Pocket uses a reference audio clip to clone the speaker’s voice for the generated speech.

Source file

nodejs-addon-examples/test_tts_non_streaming_pocket_en.js

Code

// Copyright (c)  2026  Xiaomi Corporation
//
// Text-to-speech with the Pocket TTS model (voice cloning).
// Uses a reference audio to clone the speaker's voice.
//
// Usage:
//   node tts_pocket_sync.js
//
const sherpa_onnx = require('sherpa-onnx-node');

/**
 * Build the non-streaming TTS engine for the Pocket model.
 *
 * Every model asset is resolved relative to the extracted release directory
 * `./sherpa-onnx-pocket-tts-int8-2026-01-26`, so that directory must sit next
 * to the working directory the script is started from.
 *
 * @returns {object} A new `sherpa_onnx.OfflineTts` instance built from the
 *     Pocket configuration below.
 */
function createOfflineTts() {
  const modelDir = './sherpa-onnx-pocket-tts-int8-2026-01-26';
  // Prefix a file name with the model directory.
  const asset = (name) => `${modelDir}/${name}`;

  // Files and settings specific to the Pocket model.
  const pocket = {
    lmFlow: asset('lm_flow.int8.onnx'),
    lmMain: asset('lm_main.int8.onnx'),
    encoder: asset('encoder.onnx'),
    decoder: asset('decoder.int8.onnx'),
    textConditioner: asset('text_conditioner.onnx'),
    vocabJson: asset('vocab.json'),
    tokenScoresJson: asset('token_scores.json'),
    voiceEmbeddingCacheCapacity: 50,
  };

  // Runtime options shared by the whole model section.
  const model = {
    pocket,
    debug: true,
    numThreads: 2,
    provider: 'cpu',
  };

  return new sherpa_onnx.OfflineTts({model, maxNumSentences: 1});
}

// Build the engine once; the rest of the script is a single synthesis pass.
const tts = createOfflineTts();

const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

// The voice to clone comes from a sample recording shipped with the model.
const referencePath =
    './sherpa-onnx-pocket-tts-int8-2026-01-26/test_wavs/bria.wav';
const reference = sherpa_onnx.readWave(referencePath);

// Generation options: the reference audio plus the sampling settings.
const generationConfig = new sherpa_onnx.GenerationConfig({
  speed: 1.0,
  referenceAudio: reference.samples,
  referenceSampleRate: reference.sampleRate,
  numSteps: 5,
  extra: {max_reference_audio_len: 12, seed: 42},
});

// Only the generate() call is timed; model loading and reading the reference
// wave happen before the clock starts.
const startedAt = Date.now();
const audio = tts.generate({text, generationConfig});
const finishedAt = Date.now();

const {samples, sampleRate} = audio;
const elapsedSeconds = (finishedAt - startedAt) / 1000;
const duration = samples.length / sampleRate;
// Real-time factor: processing time divided by the length of produced audio.
const realTimeFactor = elapsedSeconds / duration;

console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsedSeconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsedSeconds.toFixed(3)}/${duration.toFixed(3)} =`,
    realTimeFactor.toFixed(3));

// Persist the synthesized audio next to the script.
const filename = 'test-pocket-bria.wav';
sherpa_onnx.writeWave(filename, {samples, sampleRate});

console.log(`Saved to ${filename}`);

How to run

Install the package:
```
npm install sherpa-onnx-node
```

Download the model:

```
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-pocket-tts-int8-2026-01-26.tar.bz2
tar xf sherpa-onnx-pocket-tts-int8-2026-01-26.tar.bz2
rm sherpa-onnx-pocket-tts-int8-2026-01-26.tar.bz2
```

Set the library path and run:

```
# macOS
export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH

# Linux
export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH

node tts_pocket_sync.js
```

Notes

Pocket TTS uses voice cloning via referenceAudio in the GenerationConfig. Provide a WAV file of the target speaker.
The config key is pocket with fields: lmFlow, lmMain, encoder, decoder, textConditioner, vocabJson, tokenScoresJson, voiceEmbeddingCacheCapacity.
GenerationConfig fields for Pocket:
- referenceAudio: Float32Array of the reference audio samples.
- referenceSampleRate: Sample rate of the reference audio.
- numSteps: Number of diffusion steps (e.g., 5).
- extra.max_reference_audio_len: Max reference audio length in seconds.
- extra.seed: Random seed for reproducibility.
Pocket also supports async generation with createAsync() and generateAsync(). See the async example and play async example.