TTS: ZipVoice (Voice Cloning)

Generate speech with the ZipVoice model using voice cloning. ZipVoice uses a reference audio clip and its transcript to clone the speaker’s voice.

Source files

Sync: test_tts_non_streaming_zipvoice_zh_en.js
Async: test_tts_non_streaming_zipvoice_zh_en_async.js
Play async: test_tts_non_streaming_zipvoice_zh_en_play_async.js

Synchronous generation

// Copyright (c)  2026  Xiaomi Corporation
//
// Text-to-speech with the ZipVoice model (voice cloning).
// Uses a reference audio and reference text to clone the speaker's voice.
//
// Usage:
//   node tts_zipvoice_sync.js
//
const sherpa_onnx = require('sherpa-onnx-node');

/**
 * Builds the offline TTS engine configured for the ZipVoice zh-en model.
 *
 * All model files are read from the extracted model directory, except the
 * vocoder, which is expected next to the script.
 *
 * @returns {object} A new `sherpa_onnx.OfflineTts` instance.
 */
function createOfflineTts() {
  // Directory produced by extracting the model archive.
  const modelDir = './sherpa-onnx-zipvoice-distill-int8-zh-en-emilia';

  const zipvoice = {
    tokens: `${modelDir}/tokens.txt`,
    encoder: `${modelDir}/encoder.int8.onnx`,
    decoder: `${modelDir}/decoder.int8.onnx`,
    vocoder: './vocos_24khz.onnx',
    dataDir: `${modelDir}/espeak-ng-data`,
    lexicon: `${modelDir}/lexicon.txt`,
  };

  const model = {
    zipvoice,
    debug: true,
    numThreads: 2,
    provider: 'cpu',
  };

  return new sherpa_onnx.OfflineTts({model, maxNumSentences: 1});
}

// Create the engine first so its construction is not part of the timed region.
const tts = createOfflineTts();

// Sentence to synthesize in the cloned voice.
const text =
    '小米的价值观是真诚, 热爱. 真诚，就是不欺人也不自欺. 热爱, 就是全心投入并享受其中.';

// Voice-cloning prompt: a reference recording plus the transcript of that
// recording. ZipVoice needs both.
const referenceAudioFilename =
    './sherpa-onnx-zipvoice-distill-int8-zh-en-emilia/test_wavs/leijun-1.wav';
const referenceText =
    '那还是三十六年前, 一九八七年. 我呢考上了武汉大学的计算机系.';
const {samples: referenceAudio, sampleRate: referenceSampleRate} =
    sherpa_onnx.readWave(referenceAudioFilename);

const generationConfig = new sherpa_onnx.GenerationConfig({
  speed: 1.0,
  referenceAudio,
  referenceSampleRate,
  referenceText,
  numSteps: 4,
  extra: {min_char_in_sentence: 10},
});

// Time only the synchronous generate() call.
const startedAt = Date.now();
const audio = tts.generate({text, generationConfig});
const finishedAt = Date.now();

// Real-time factor = processing time / duration of the produced audio.
const {samples, sampleRate} = audio;
const elapsedSeconds = (finishedAt - startedAt) / 1000;
const duration = samples.length / sampleRate;
const realTimeFactor = elapsedSeconds / duration;

const elapsedText = elapsedSeconds.toFixed(3);
const durationText = duration.toFixed(3);
console.log('Wave duration', durationText, 'seconds');
console.log('Elapsed', elapsedText, 'seconds');
console.log(`RTF = ${elapsedText}/${durationText} =`, realTimeFactor.toFixed(3));

// Write the generated audio to a wave file in the current directory.
const outputFilename = 'test-zipvoice-zh-en.wav';
sherpa_onnx.writeWave(outputFilename, {samples, sampleRate});

console.log(`Saved to ${outputFilename}`);

How to run

Install the packages:

npm install sherpa-onnx-node
npm install speaker  # only needed for play_async

Download the model and vocoder:

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-zipvoice-distill-int8-zh-en-emilia.tar.bz2
tar xf sherpa-onnx-zipvoice-distill-int8-zh-en-emilia.tar.bz2
rm sherpa-onnx-zipvoice-distill-int8-zh-en-emilia.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos_24khz.onnx

Set the library path and run:

# macOS
export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH

# Linux
export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH

node tts_zipvoice_sync.js

Notes

ZipVoice requires both a reference audio clip and its transcript for voice cloning. The GenerationConfig must include:
- referenceAudio: Float32Array of the reference audio samples.
- referenceSampleRate: Sample rate of the reference audio.
- referenceText: Transcript of the reference audio.
- numSteps: Number of diffusion steps (e.g., 4).
- extra.min_char_in_sentence: Minimum characters per sentence.
In the OfflineTts config, the model settings go under model.zipvoice, which has the fields: tokens, encoder, decoder, vocoder, dataDir, lexicon.
ZipVoice also supports async generation. See the async example and play async example.