Spoken Language Identification

Identify the language spoken in a WAV file using a Whisper multilingual model. This example classifies audio files into their corresponding languages.

Source file

nodejs-addon-examples/test_spoken_language_identification.js

Code

 1// Copyright (c)  2023-2024  Xiaomi Corporation
 2//
 3// Spoken language identification using a Whisper multilingual model.
 4//
 5// Usage:
 6//   node spoken_language_identification.js
 7//
 8const sherpa_onnx = require('sherpa-onnx-node');
 9
10// Download whisper multi-lingual models from
11// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
12function createSpokenLanguageID() {
13  const config = {
14    whisper: {
15      encoder: './sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx',
16      decoder: './sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx',
17    },
18    debug: true,
19    numThreads: 1,
20    provider: 'cpu',
21  };
22  return new sherpa_onnx.SpokenLanguageIdentification(config);
23}
24
25const slid = createSpokenLanguageID();
26
27const testWaves = [
28  './spoken-language-identification-test-wavs/ar-arabic.wav',
29  './spoken-language-identification-test-wavs/de-german.wav',
30  './spoken-language-identification-test-wavs/en-english.wav',
31  './spoken-language-identification-test-wavs/fr-french.wav',
32  './spoken-language-identification-test-wavs/pt-portuguese.wav',
33  './spoken-language-identification-test-wavs/es-spanish.wav',
34  './spoken-language-identification-test-wavs/zh-chinese.wav',
35];
36
37// Intl.DisplayNames converts ISO language codes to human-readable names.
38const display = new Intl.DisplayNames(['en'], {type: 'language'});
39
40for (let f of testWaves) {
41  const stream = slid.createStream();
42
43  const wave = sherpa_onnx.readWave(f);
44  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
45
46  const lang = slid.compute(stream);
47  console.log(`${f}: ${lang} (${display.of(lang)})`);
48}

How to run

  1. Install the package:

    npm install sherpa-onnx-node
    
  2. Download the Whisper multilingual model and test files:

    curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
    tar xvf sherpa-onnx-whisper-tiny.tar.bz2
    rm sherpa-onnx-whisper-tiny.tar.bz2
    
    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/spoken-language-identification-test-wavs.tar.bz2
    tar xvf spoken-language-identification-test-wavs.tar.bz2
    rm spoken-language-identification-test-wavs.tar.bz2
    
  3. Set the library path and run:

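    # macOS (Apple Silicon; use sherpa-onnx-darwin-x64 on Intel Macs)
    export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-darwin-arm64:$DYLD_LIBRARY_PATH
    
    # Linux (x64; use sherpa-onnx-linux-arm64 on 64-bit ARM)
    export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-linux-x64:$LD_LIBRARY_PATH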
    
    node test_spoken_language_identification.js
    

Expected output

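./spoken-language-identification-test-wavs/ar-arabic.wav: ar (Arabic)
./spoken-language-identification-test-wavs/de-german.wav: de (German)
./spoken-language-identification-test-wavs/en-english.wav: en (English)
./spoken-language-identification-test-wavs/fr-french.wav: fr (French)
./spoken-language-identification-test-wavs/pt-portuguese.wav: pt (Portuguese)
./spoken-language-identification-test-wavs/es-spanish.wav: es (Spanish)
./spoken-language-identification-test-wavs/zh-chinese.wav: zh (Chinese)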

Notes

  • SpokenLanguageIdentification requires a Whisper multilingual model (not an English-only model).

  • compute() returns the short language code used by Whisper, which for most languages is the two-letter ISO 639-1 code (e.g., en, zh, fr).

  • Intl.DisplayNames is a built-in JavaScript API that converts language codes to human-readable names.