TTS: VITS (Chinese, AiShell3)

Generate speech with the VITS Chinese (icefall/aishell3) model. This model uses a lexicon and rule FSTs/FARs for Chinese text normalization. It supports both synchronous and asynchronous generation.

For model documentation, see VITS AiShell3.

Source files

Synchronous generation

 // Copyright (c)  2024  Xiaomi Corporation
 //
 // Text-to-speech with the VITS Chinese (icefall/aishell3) model.
 // Uses lexicon and rule FSTs/FARs for Chinese text normalization.
 //
 // Usage:
 //   node tts_vits_zh_aishell3.js
 //
 const sherpa_onnx = require('sherpa-onnx-node');

 // Directory produced by extracting vits-icefall-zh-aishell3.tar.bz2; every
 // model file referenced below lives inside it.
 const modelDir = './vits-icefall-zh-aishell3';

 // Rule FST files used for text normalization.
 const ruleFstNames =
     ['date.fst', 'phone.fst', 'number.fst', 'new_heteronym.fst'];

 /**
  * Builds the offline TTS engine for the VITS AiShell3 model.
  *
  * @returns {sherpa_onnx.OfflineTts} A synchronously constructed TTS engine.
  */
 function createOfflineTts() {
   const config = {
     model: {
       vits: {
         model: `${modelDir}/model.onnx`,
         tokens: `${modelDir}/tokens.txt`,
         lexicon: `${modelDir}/lexicon.txt`,
       },
       debug: true,
       numThreads: 1,
       provider: 'cpu',
     },
     maxNumSentences: 1,
     // ruleFsts is passed as one comma-separated string of FST paths.
     ruleFsts: ruleFstNames.map((name) => `${modelDir}/${name}`).join(','),
     ruleFars: `${modelDir}/rule.far`,
   };
   return new sherpa_onnx.OfflineTts(config);
 }

 const tts = createOfflineTts();

 // Sample sentence mixing heteronyms (长, 行), a phone number, a date, and an
 // amount of money, so that the normalization rules have something to do.
 const text =
     '他在长沙出生,长白山长大,去过长江,现在他是一个银行的行长,主管行政工作。有困难,请拨110,或者13020240513。今天是2024年5月13号, 他上个月的工资是12345块钱。';

 const generationConfig = new sherpa_onnx.GenerationConfig({
   sid: 88,  // speaker ID within the multi-speaker model
   speed: 1.0,
   silenceScale: 0.2,
 });

 // Time only the generate() call so the real-time factor reflects synthesis.
 const start = Date.now();
 const audio = tts.generate({text, generationConfig});
 const stop = Date.now();

 const elapsedSeconds = (stop - start) / 1000;
 const duration = audio.samples.length / audio.sampleRate;
 // Real-time factor: processing time divided by the length of the audio.
 const realTimeFactor = elapsedSeconds / duration;
 console.log('Wave duration', duration.toFixed(3), 'seconds');
 console.log('Elapsed', elapsedSeconds.toFixed(3), 'seconds');
 console.log(
     `RTF = ${elapsedSeconds.toFixed(3)}/${duration.toFixed(3)} =`,
     realTimeFactor.toFixed(3));

 const filename = 'test-vits-zh-aishell3.wav';
 sherpa_onnx.writeWave(
     filename, {samples: audio.samples, sampleRate: audio.sampleRate});

 console.log(`Saved to ${filename}`);

Asynchronous generation

 // Copyright (c)  2026  Xiaomi Corporation
 //
 // Asynchronous text-to-speech with the VITS Chinese (AiShell3) model.
 //
 // Usage:
 //   node tts_vits_zh_aishell3_async.js
 //
 const sherpa_onnx = require('sherpa-onnx-node');

 // Directory produced by extracting vits-icefall-zh-aishell3.tar.bz2; every
 // model file referenced below lives inside it.
 const modelDir = './vits-icefall-zh-aishell3';

 // Rule FST files used for text normalization.
 const ruleFstNames =
     ['date.fst', 'phone.fst', 'number.fst', 'new_heteronym.fst'];

 /**
  * Builds the offline TTS engine for the VITS AiShell3 model without blocking
  * the caller while the engine is being created.
  *
  * @returns {Promise<sherpa_onnx.OfflineTts>} Resolves to the TTS engine.
  */
 async function createOfflineTts() {
   const config = {
     model: {
       vits: {
         model: `${modelDir}/model.onnx`,
         tokens: `${modelDir}/tokens.txt`,
         lexicon: `${modelDir}/lexicon.txt`,
       },
       debug: false,
       numThreads: 1,
       provider: 'cpu',
     },
     maxNumSentences: 1,
     // ruleFsts is passed as one comma-separated string of FST paths.
     ruleFsts: ruleFstNames.map((name) => `${modelDir}/${name}`).join(','),
     ruleFars: `${modelDir}/rule.far`,
   };
   return sherpa_onnx.OfflineTts.createAsync(config);
 }

 /**
  * Generates speech for the sample sentence, reports progress and timing, and
  * writes the result to a WAV file.
  *
  * @returns {Promise<void>}
  */
 async function main() {
   const tts = await createOfflineTts();

   // Sample sentence mixing heteronyms (长, 行), a phone number, a date, and
   // an amount of money, so that the normalization rules have something to do.
   const text =
       '他在长沙出生,长白山长大,去过长江,现在他是一个银行的行长,主管行政工作。有困难,请拨110,或者13020240513。今天是2024年5月13号, 他上个月的工资是12345块钱。';

   const generationConfig = new sherpa_onnx.GenerationConfig({
     sid: 88,  // speaker ID within the multi-speaker model
     speed: 1.0,
     silenceScale: 0.2,
   });

   const start = Date.now();
   const audio = await tts.generateAsync({
     text,
     // NOTE(review): presumably lets the addon hand back its sample buffer
     // without copying it -- confirm against the sherpa-onnx-node docs.
     enableExternalBuffer: true,
     generationConfig,
     onProgress: ({samples, progress}) => {
       // '\r' rewinds to the start of the line so each update overwrites the
       // previous one instead of scrolling the terminal.
       process.stdout.write(
           `Progress: ${(progress * 100).toFixed(1)}%, ` +
           `Samples: ${samples.length}\r`);
       // NOTE(review): 1 presumably means "keep generating" -- confirm
       // against the sherpa-onnx-node docs.
       return 1;
     },
   });
   // Stop the clock before any further console output so the measurement
   // covers the generation call only.
   const stop = Date.now();

   // Move past the progress line that was being rewritten in place.
   console.log('');

   const elapsedSeconds = (stop - start) / 1000;
   const duration = audio.samples.length / audio.sampleRate;
   // Real-time factor: processing time divided by the length of the audio.
   const realTimeFactor = elapsedSeconds / duration;
   console.log('Wave duration', duration.toFixed(3), 'seconds');
   console.log('Elapsed', elapsedSeconds.toFixed(3), 'seconds');
   console.log(
       `RTF = ${elapsedSeconds.toFixed(3)}/${duration.toFixed(3)} =`,
       realTimeFactor.toFixed(3));

   const filename = 'test-vits-zh-aishell3-async.wav';
   sherpa_onnx.writeWave(
       filename, {samples: audio.samples, sampleRate: audio.sampleRate});
   console.log(`Saved to ${filename}`);
 }

 main().catch((err) => {
   console.error('Error:', err);
   // Report the failure to the shell; logging alone would still exit with 0.
   process.exitCode = 1;
 });

How to run

  1. Install the package:

    npm install sherpa-onnx-node
    
  2. Download the model:

    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
    tar xvf vits-icefall-zh-aishell3.tar.bz2
    
  3. Set the library path and run:

    # macOS
    export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH
    
    # Linux
    export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH
    
    # Choose one:
    node tts_vits_zh_aishell3.js
    node tts_vits_zh_aishell3_async.js
    

Notes

  • The config key is vits with fields: model, tokens, lexicon.

  • lexicon maps Chinese characters to phonemes.

  • ruleFsts contains comma-separated FST files for text normalization: date.fst, phone.fst, number.fst, new_heteronym.fst.

  • ruleFars contains a FAR file for additional rules.

  • The model has 174 speakers. sid selects the speaker by index (0–173); the examples use sid: 88.

  • The example text contains heteronyms (for example 长 and 行, which are read differently depending on the word), dates, phone numbers, and monetary amounts, all of which are normalized by the rule FSTs.

  • The sync API uses new sherpa_onnx.OfflineTts(config) and tts.generate({text, generationConfig}).

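  • The async API uses sherpa_onnx.OfflineTts.createAsync(config) and tts.generateAsync({text, generationConfig, onProgress}), where onProgress is a callback that receives {samples, progress} while the audio is being generated.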