TTS: Kokoro (Chinese + English)

Generate speech with the Kokoro multi-language (Chinese+English, v1.0) model. This model supports mixed Chinese-English text with multiple speaker voices and both synchronous and asynchronous generation.

For model documentation, see Kokoro Chinese+English.

Source files

Synchronous generation

 1// Copyright (c)  2025  Xiaomi Corporation
 2//
 3// Text-to-speech with the Kokoro multi-language (Chinese+English) model.
 4//
 5// Usage:
 6//   node tts_kokoro_zh_en.js
 7//
 8const sherpa_onnx = require('sherpa-onnx-node');
 9
10function createOfflineTts() {
11  const config = {
12    model: {
13      kokoro: {
14        model: './kokoro-multi-lang-v1_0/model.onnx',
15        voices: './kokoro-multi-lang-v1_0/voices.bin',
16        tokens: './kokoro-multi-lang-v1_0/tokens.txt',
17        dataDir: './kokoro-multi-lang-v1_0/espeak-ng-data',
18        // Multiple lexicon files are separated by commas.
19        lexicon:
20            './kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt',
21      },
22      debug: true,
23      numThreads: 1,
24      provider: 'cpu',
25    },
26    maxNumSentences: 1,
27  };
28  return new sherpa_onnx.OfflineTts(config);
29}
30
31const tts = createOfflineTts();
32
33const text =
34    '中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?';
35
36const generationConfig = new sherpa_onnx.GenerationConfig({
37  sid: 48,
38  speed: 1.0,
39  silenceScale: 0.2,
40});
41
42let start = Date.now();
43const audio = tts.generate({text, generationConfig});
44let stop = Date.now();
45const elapsed_seconds = (stop - start) / 1000;
46const duration = audio.samples.length / audio.sampleRate;
47const real_time_factor = elapsed_seconds / duration;
48console.log('Wave duration', duration.toFixed(3), 'seconds');
49console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
50console.log(
51    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
52    real_time_factor.toFixed(3));
53
54const filename = 'test-kokoro-zh-en.wav';
55sherpa_onnx.writeWave(
56    filename, {samples: audio.samples, sampleRate: audio.sampleRate});
57
58console.log(`Saved to ${filename}`);

Asynchronous generation

 1// Copyright (c)  2026  Xiaomi Corporation
 2//
 3// Asynchronous text-to-speech with the Kokoro Chinese+English model.
 4//
 5// Usage:
 6//   node tts_kokoro_zh_en_async.js
 7//
 8const sherpa_onnx = require('sherpa-onnx-node');
 9
10async function createOfflineTts() {
11  const config = {
12    model: {
13      kokoro: {
14        model: './kokoro-multi-lang-v1_0/model.onnx',
15        voices: './kokoro-multi-lang-v1_0/voices.bin',
16        tokens: './kokoro-multi-lang-v1_0/tokens.txt',
17        dataDir: './kokoro-multi-lang-v1_0/espeak-ng-data',
18        lexicon:
19            './kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt',
20      },
21      debug: false,
22      numThreads: 1,
23      provider: 'cpu',
24    },
25    maxNumSentences: 1,
26  };
27  return await sherpa_onnx.OfflineTts.createAsync(config);
28}
29
30async function main() {
31  const tts = await createOfflineTts();
32
33  const text =
34      '中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?';
35
36  const generationConfig = new sherpa_onnx.GenerationConfig({
37    sid: 48,
38    speed: 1.0,
39    silenceScale: 0.2,
40  });
41
42  const start = Date.now();
43  const audio = await tts.generateAsync({
44    text,
45    enableExternalBuffer: true,
46    generationConfig,
47    onProgress: ({samples, progress}) => {
48      process.stdout.write(
49          `Progress: ${(progress * 100).toFixed(1)}%, ` +
50          `Samples: ${samples.length}\r`);
51      return 1;
52    },
53  });
54
55  console.log('');
56  const stop = Date.now();
57  const elapsed_seconds = (stop - start) / 1000;
58  const duration = audio.samples.length / audio.sampleRate;
59  const real_time_factor = elapsed_seconds / duration;
60  console.log('Wave duration', duration.toFixed(3), 'seconds');
61  console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
62  console.log(
63      `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
64      real_time_factor.toFixed(3));
65
66  const filename = 'test-kokoro-zh-en-async.wav';
67  sherpa_onnx.writeWave(
68      filename, {samples: audio.samples, sampleRate: audio.sampleRate});
69  console.log(`Saved to ${filename}`);
70}
71
72main().catch((err) => {
73  console.error('Error:', err);
74});

How to run

  1. Install the package:

    npm install sherpa-onnx-node
    
  2. Download the model:

    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
    tar xf kokoro-multi-lang-v1_0.tar.bz2
    rm kokoro-multi-lang-v1_0.tar.bz2
    
  3. Set the library path and run:

    # macOS
    export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH
    
    # Linux
    export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH
    
    # Choose one:
    node tts_kokoro_zh_en.js
    node tts_kokoro_zh_en_async.js
    

Notes

  • This model uses the same kokoro config key as the English model, but adds a lexicon field whose value is a comma-separated list of lexicon files, one per language (e.g., lexicon-us-en.txt,lexicon-zh.txt).

  • sid: 48 selects a specific speaker voice. The multi-lang model has more speakers than the English-only model.

  • The sync API uses new sherpa_onnx.OfflineTts(config) and tts.generate({text, generationConfig}).

  • The async API uses sherpa_onnx.OfflineTts.createAsync(config) and tts.generateAsync({text, generationConfig, onProgress, ...}), where the onProgress callback receives {samples, progress}.

  • For English-only, see TTS: Kokoro (English).