VAD with Microphone

Detect speech from a microphone in real time using Silero VAD (Voice Activity Detection). Each detected speech segment is saved as a separate WAV file.

Source file

nodejs-addon-examples/test_vad_microphone.js

Code

  1// Copyright (c)  2023-2024  Xiaomi Corporation
  2//
  3// Voice activity detection (VAD) from a microphone using Silero VAD.
  4// Detected speech segments are saved as WAV files.
  5//
  6// Usage:
  7//   npm install node-cpal
  8//   node test_vad_microphone.js
  9//
 10const cpal = require('node-cpal');
 11const sherpa_onnx = require('sherpa-onnx-node');
 12
 /**
  * Builds the voice activity detector used by this example.
  *
  * The Silero model is read from ./silero_vad.onnx. Download it from
  * https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  *
  * @returns {sherpa_onnx.Vad} A detector configured for 16000 Hz input.
  */
 function createVad() {
   // Detection settings that are identical for both model sections.
   const sharedSettings = {
     threshold: 0.5,
     minSpeechDuration: 0.25,
     minSilenceDuration: 0.5,
   };

   const sileroVad = {
     model: './silero_vad.onnx',
     ...sharedSettings,
     windowSize: 512,
   };

   // The ten-vad model path is left empty in this example.
   const tenVad = {
     model: '',
     ...sharedSettings,
     windowSize: 256,
   };

   // Second constructor argument: buffer size, in seconds.
   const vadBufferSeconds = 60;

   return new sherpa_onnx.Vad(
       {sileroVad, tenVad, sampleRate: 16000, debug: true, numThreads: 1},
       vadBufferSeconds);
 }
 39
 const vad = createVad();

 // CircularBuffer accumulates resampled microphone samples until enough are
 // available to feed the VAD one window at a time.
 const bufferSizeInSeconds = 30;
 const buffer =
     new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

 // Number of samples handed to the VAD per call: the Silero window size when a
 // Silero model path is configured, otherwise the ten-vad window size.
 // Computed once here instead of on every audio callback.
 const windowSize = vad.config.sileroVad.model !== '' ?
     vad.config.sileroVad.windowSize :
     vad.config.tenVad.windowSize;

 // State shared with the audio callback. Declared before the stream is created
 // so the callback can never observe these bindings uninitialized.
 let printed = false;  // true once "Detected speech" was logged for this run
 let index = 0;        // segments saved so far; also used in file names

 /**
  * Formats the local time of `date` as HH-MM-SS for use in a file name.
  *
  * Built from the numeric fields rather than toLocaleTimeString() so the
  * result does not depend on the runtime's locale data.
  * NOTE(review): some runtimes reportedly print midnight as "24" with
  * {hour12: false}; confirm before relying on the locale-based form.
  *
  * @param {Date} date - The moment to format.
  * @returns {string} Zero-padded hours, minutes and seconds joined by '-'.
  */
 function formatTimeForFilename(date) {
   return [date.getHours(), date.getMinutes(), date.getSeconds()]
       .map((part) => String(part).padStart(2, '0'))
       .join('-');
 }

 /**
  * Writes every completed speech segment queued in the VAD to its own WAV
  * file and logs its duration and file name.
  */
 function saveCompletedSegments() {
   while (!vad.isEmpty()) {
     const segment = vad.front();
     vad.pop();

     const filename = `${index}-${formatTimeForFilename(new Date())}.wav`;
     sherpa_onnx.writeWave(
         filename,
         {samples: segment.samples, sampleRate: vad.config.sampleRate});

     const duration = segment.samples.length / vad.config.sampleRate;
     console.log(`${index} End of speech. Duration: ${duration} seconds`);
     console.log(`Saved to ${filename}`);
     index += 1;
   }
 }

 // Open the default microphone via node-cpal.
 const inputDevice = cpal.getDefaultInputDevice();
 const deviceConfig = cpal.getDefaultInputConfig(inputDevice.deviceId);
 const nativeSampleRate = deviceConfig.sampleRate;
 const targetSampleRate = vad.config.sampleRate;
 const resampler =
     new sherpa_onnx.LinearResampler(nativeSampleRate, targetSampleRate);

 console.log(
     `Device: ${inputDevice.name}, native sample rate: ${nativeSampleRate} Hz`);
 console.log(`Resampling from ${nativeSampleRate} to ${targetSampleRate} Hz`);

 const inputStream = cpal.createStream(
     inputDevice.deviceId,
     true,  // true = input/recording stream
     {
       sampleRate: nativeSampleRate,
       channels: 1,
       format: 'f32',
     },
     (data) => {
       // data is a Float32Array of audio samples at nativeSampleRate.
       // Resample to targetSampleRate before processing.
       buffer.push(resampler.resample(data));

       // Drain the buffer one VAD window at a time. As in the original
       // example, a buffer holding exactly windowSize samples is left for the
       // next callback.
       while (buffer.size() > windowSize) {
         const samples = buffer.get(buffer.head(), windowSize);
         buffer.pop(windowSize);
         vad.acceptWaveform(samples);

         // Log only when speech starts; re-arm once speech is no longer
         // detected.
         const speaking = vad.isDetected();
         if (speaking && !printed) {
           console.log(`${index}: Detected speech`);
           printed = true;
         }
         if (!speaking) {
           printed = false;
         }

         // Extract completed speech segments.
         saveCompletedSegments();
       }
     });

 console.log('Started! Please speak');

 // The process keeps running while the stream is open. Press Ctrl+C to stop.
 process.on('SIGINT', () => {
   cpal.closeStream(inputStream);
   console.log('Free resources');
   process.exit(0);
 });

How to run

  1. Install the packages:

    npm install sherpa-onnx-node
    npm install node-cpal
    
  2. Download the VAD model:

    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
    
  3. Set the library path and run:

    # macOS
    export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH
    
    # Linux
    export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH
    
    node test_vad_microphone.js
    
  4. Speak into the microphone. Detected speech segments will be printed and saved as WAV files. Press Ctrl+C to stop.

Expected output

Started! Please speak
0: Detected speech
0 End of speech. Duration: 2.345 seconds
Saved to 0-14-30-25.wav

Notes

  • node-cpal provides cross-platform microphone access via Rust’s CPAL library. Install it with npm install node-cpal.

  • cpal.getDefaultInputDevice() returns the default microphone device object. Use device.deviceId to get the device ID string.

  • cpal.getDefaultInputConfig(deviceId) returns the device’s native sample rate and format.

  • cpal.createStream(deviceId, true, config, callback) opens an input stream. The callback receives a Float32Array of audio samples.

  • new sherpa_onnx.LinearResampler(inputRate, outputRate) creates a resampler to convert audio from the device’s native sample rate to the rate required by the model (e.g., 16kHz).

  • resampler.resample(samples) resamples a Float32Array chunk and returns the resampled Float32Array.

  • CircularBuffer stores incoming microphone samples. The VAD processes audio in fixed-size windows (512 samples for Silero VAD at 16kHz).

  • isDetected() returns true when speech is currently being detected.

  • isEmpty() / front() / pop() are used to extract completed speech segments (speech followed by enough silence).

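  • You can also use ten-vad.onnx instead of silero_vad.onnx. To do so, set tenVad.model to the path of ten-vad.onnx and set sileroVad.model to an empty string; the example then feeds the VAD in windows of tenVad.windowSize (256 samples) instead of 512.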