VAD with Microphone

Detect speech from a microphone in real time using Silero VAD (Voice Activity Detection). Each detected speech segment is saved as a separate WAV file.

Source file

nodejs-addon-examples/test_vad_microphone.js

Code

// Copyright (c)  2023-2024  Xiaomi Corporation
//
// Voice activity detection (VAD) from a microphone using Silero VAD.
// Detected speech segments are saved as WAV files.
//
// Usage:
//   npm install sherpa-onnx-node node-cpal
//   node test_vad_microphone.js
//
const cpal = require('node-cpal');
const sherpa_onnx = require('sherpa-onnx-node');

/**
 * Create the voice activity detector used by this example.
 *
 * The Silero section points at `./silero_vad.onnx`; the ten-vad section is
 * left without a model path. Both sections share the same threshold and
 * duration settings and differ only in model path and window size.
 *
 * @returns {object} A `sherpa_onnx.Vad` built with a 60-second buffer.
 */
function createVad() {
  // The Silero model can be downloaded from
  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  const sharedOptions = {
    threshold: 0.5,
    minSpeechDuration: 0.25,
    minSilenceDuration: 0.5,
  };

  const vadConfig = {
    sileroVad: {model: './silero_vad.onnx', ...sharedOptions, windowSize: 512},
    tenVad: {model: '', ...sharedOptions, windowSize: 256},
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
  };

  const vadBufferSeconds = 60;
  return new sherpa_onnx.Vad(vadConfig, vadBufferSeconds);
}

const vad = createVad();
// Every sample handed to the VAD must be at this rate.
const targetSampleRate = vad.config.sampleRate;

// Staging buffer for incoming audio, sized for 30 seconds at the VAD rate.
const bufferSizeInSeconds = 30;
const bufferCapacity = bufferSizeInSeconds * targetSampleRate;
const buffer = new sherpa_onnx.CircularBuffer(bufferCapacity);

// Look up the default microphone and its native configuration via node-cpal,
// then set up a resampler from the device rate to the VAD rate.
const inputDevice = cpal.getDefaultInputDevice();
const deviceConfig = cpal.getDefaultInputConfig(inputDevice.deviceId);
const {sampleRate: nativeSampleRate} = deviceConfig;
const resampler =
    new sherpa_onnx.LinearResampler(nativeSampleRate, targetSampleRate);

console.log(`Device: ${inputDevice.name}, native sample rate: ${nativeSampleRate} Hz`);
console.log(`Resampling from ${nativeSampleRate} to ${targetSampleRate} Hz`);

// State shared between audio callbacks. Declared before the stream is created
// so the callback can never observe these bindings uninitialized.
let printed = false;  // whether the current speech onset was already logged
let index = 0;        // sequence number used in logs and file names

// Samples per VAD step: the Silero window when a Silero model path is set,
// otherwise the ten-vad window. The config is fixed, so this is computed once
// instead of on every audio callback.
const windowSize = vad.config.sileroVad.model !== '' ?
    vad.config.sileroVad.windowSize :
    vad.config.tenVad.windowSize;

/**
 * Build the output file name for a speech segment: `<index>-HH-MM-SS.wav`.
 *
 * The time is formatted by hand from the local clock so the name does not
 * depend on locale data; `toLocaleTimeString('en-US', {hour12: false})` can
 * render the hour after midnight as `24`.
 *
 * @param {number} segmentIndex - Sequence number of the segment.
 * @param {Date} [now] - Timestamp to embed; defaults to the current time.
 * @returns {string} File name using `-` as the time separator.
 */
function segmentFilename(segmentIndex, now = new Date()) {
  const pad = (value) => String(value).padStart(2, '0');
  const time =
      [now.getHours(), now.getMinutes(), now.getSeconds()].map(pad).join('-');
  return `${segmentIndex}-${time}.wav`;
}

const inputStream = cpal.createStream(
    inputDevice.deviceId,
    true,  // true = input/recording stream
    {
      sampleRate: nativeSampleRate,
      channels: 1,
      format: 'f32',
    },
    (data) => {
      // data is a Float32Array of audio samples at nativeSampleRate; convert
      // it to the VAD's sample rate before buffering.
      buffer.push(resampler.resample(data));

      // Feed the VAD one fixed-size window at a time.
      while (buffer.size() > windowSize) {
        const samples = buffer.get(buffer.head(), windowSize);
        buffer.pop(windowSize);
        vad.acceptWaveform(samples);

        // Log the onset of speech once, then re-arm when speech stops.
        const speaking = vad.isDetected();
        if (speaking && !printed) {
          console.log(`${index}: Detected speech`);
        }
        printed = speaking;

        // Drain completed speech segments and save each one as a WAV file.
        while (!vad.isEmpty()) {
          const segment = vad.front();
          vad.pop();
          const filename = segmentFilename(index);
          sherpa_onnx.writeWave(
              filename,
              {samples: segment.samples, sampleRate: vad.config.sampleRate});
          const duration = segment.samples.length / vad.config.sampleRate;
          console.log(
              `${index} End of speech. Duration: ${duration} seconds`);
          console.log(`Saved to ${filename}`);
          index += 1;
        }
      }
    });

console.log('Started! Please speak');

/**
 * Ctrl+C handler: close the microphone stream, report it, and exit with
 * status 0.
 */
const stopAndExit = () => {
  cpal.closeStream(inputStream);
  console.log('Free resources');
  process.exit(0);
};

// The script runs until interrupted; press Ctrl+C to stop.
process.on('SIGINT', stopAndExit);

How to run

Install the packages:

npm install sherpa-onnx-node
npm install node-cpal

Download the VAD model:

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx

Set the library path and run:

# macOS
export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH

# Linux
export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH

node test_vad_microphone.js

Speak into the microphone. Detected speech segments will be printed and saved as WAV files. Press Ctrl+C to stop.

Expected output

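Device: <device name>, native sample rate: <native rate> Hz
Resampling from <native rate> to 16000 Hz
Started! Please speak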
0: Detected speech
0 End of speech. Duration: 2.345 seconds
Saved to 0-14-30-25.wav

Notes

node-cpal provides cross-platform microphone access via Rust’s CPAL library. Install it with npm install node-cpal.
cpal.getDefaultInputDevice() returns the default microphone device object. Use device.deviceId to get the device ID string.
cpal.getDefaultInputConfig(deviceId) returns the device’s native sample rate and format.
cpal.createStream(deviceId, true, config, callback) opens an input stream. The callback receives a Float32Array of audio samples.
new sherpa_onnx.LinearResampler(inputRate, outputRate) creates a resampler to convert audio from the device’s native sample rate to the rate required by the model (e.g., 16kHz).
resampler.resample(samples) resamples a Float32Array chunk and returns the resampled Float32Array.
CircularBuffer stores incoming microphone samples. The VAD processes audio in fixed-size windows (512 samples for Silero VAD at 16kHz).
isDetected() returns true when speech is currently being detected.
isEmpty() / front() / pop() are used to extract completed speech segments (speech followed by enough silence).
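You can also use ten-vad.onnx instead of silero_vad.onnx: set sileroVad.model to '' and tenVad.model to the path of ten-vad.onnx. The example then uses the ten-vad window size (256 samples) instead of the Silero one (512 samples).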