VAD with Microphone
Detect speech from a microphone in real time using Silero VAD (Voice Activity Detection). Each detected speech segment is saved as a separate WAV file.
Source file
Code
1// Copyright (c) 2023-2024 Xiaomi Corporation
2//
3// Voice activity detection (VAD) from a microphone using Silero VAD.
4// Detected speech segments are saved as WAV files.
5//
6// Usage:
7// npm install node-cpal
8// node vad_microphone.js
9//
10const cpal = require('node-cpal');
11const sherpa_onnx = require('sherpa-onnx-node');
12
/**
 * Create the voice activity detector used by this example.
 *
 * The Silero model path is filled in and the ten-vad model path is left
 * empty. Both sub-configs use the same threshold and duration settings and
 * differ only in model path and window size.
 *
 * Download silero_vad.onnx from
 * https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
 *
 * @returns {sherpa_onnx.Vad} A detector constructed from the config below.
 */
function createVad() {
  // Settings that are identical for both VAD sub-configs.
  const sharedSettings = {
    threshold: 0.5,
    minSpeechDuration: 0.25,
    minSilenceDuration: 0.5,
  };

  const sileroVad = {
    model: './silero_vad.onnx',
    ...sharedSettings,
    windowSize: 512,
  };

  // Empty model path: this sub-config only supplies defaults.
  const tenVad = {
    model: '',
    ...sharedSettings,
    windowSize: 256,
  };

  // Second constructor argument of sherpa_onnx.Vad: the detector's own
  // buffer size, in seconds.
  const vadBufferSeconds = 60;

  return new sherpa_onnx.Vad(
      {
        sileroVad,
        tenVad,
        sampleRate: 16000,
        debug: true,
        numThreads: 1,
      },
      vadBufferSeconds);
}
39
const vad = createVad();

// CircularBuffer holds resampled microphone samples until the VAD consumes
// them one window at a time.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

// Number of samples handed to the VAD per call: the Silero window size when a
// Silero model path is configured, otherwise the ten-vad window size. Nothing
// in this script changes the config afterwards, so compute it once instead of
// on every audio callback.
const windowSize = vad.config.sileroVad.model !== '' ?
    vad.config.sileroVad.windowSize :
    vad.config.tenVad.windowSize;

// State shared across audio callbacks. Declared before the stream is created
// so the callback can never run while these bindings are still uninitialized.
let printed = false;  // true once "Detected speech" was logged for this utterance
let index = 0;        // count of segments saved so far; prefixes file names

// Open the default microphone via node-cpal.
const inputDevice = cpal.getDefaultInputDevice();
const deviceConfig = cpal.getDefaultInputConfig(inputDevice.deviceId);
const nativeSampleRate = deviceConfig.sampleRate;
const targetSampleRate = vad.config.sampleRate;
const resampler =
    new sherpa_onnx.LinearResampler(nativeSampleRate, targetSampleRate);

console.log(
    `Device: ${inputDevice.name}, native sample rate: ${nativeSampleRate} Hz`);
console.log(`Resampling from ${nativeSampleRate} to ${targetSampleRate} Hz`);

/**
 * Build the WAV file name for a finished speech segment, e.g. `0-14-30-25.wav`.
 *
 * @param {number} segmentIndex - Running index of the segment.
 * @returns {string} `<index>-<HH>-<mm>-<ss>.wav` using the current local time.
 */
function segmentFilename(segmentIndex) {
  // hourCycle 'h23' keeps the hour in 00-23; `hour12: false` can render
  // midnight as "24" for en-US.
  const time = new Date()
                   .toLocaleTimeString('en-US', {hourCycle: 'h23'})
                   .replace(/:/g, '-');
  return `${segmentIndex}-${time}.wav`;
}

const inputStream = cpal.createStream(
    inputDevice.deviceId,
    true,  // true = input/recording stream
    {
      sampleRate: nativeSampleRate,
      channels: 1,
      format: 'f32',
    },
    (data) => {
      // data is a Float32Array of audio samples at nativeSampleRate.
      // Resample to targetSampleRate before buffering.
      buffer.push(resampler.resample(data));

      // `>=` so that a buffer holding exactly one full window is processed
      // now rather than waiting for the next callback.
      while (buffer.size() >= windowSize) {
        const samples = buffer.get(buffer.head(), windowSize);
        buffer.pop(windowSize);
        vad.acceptWaveform(samples);

        // Log only on the transition from "no speech" to "speech".
        const speaking = vad.isDetected();
        if (speaking && !printed) {
          console.log(`${index}: Detected speech`);
        }
        printed = speaking;

        // Extract completed speech segments and save each one to its own file.
        while (!vad.isEmpty()) {
          const segment = vad.front();
          vad.pop();

          const filename = segmentFilename(index);
          sherpa_onnx.writeWave(
              filename,
              {samples: segment.samples, sampleRate: vad.config.sampleRate});

          const duration = segment.samples.length / vad.config.sampleRate;
          console.log(`${index} End of speech. Duration: ${duration} seconds`);
          console.log(`Saved to ${filename}`);
          index += 1;
        }
      }
    });

console.log('Started! Please speak');

// Keep the process running. Press Ctrl+C to stop.
process.on('SIGINT', () => {
  cpal.closeStream(inputStream);
  console.log('Free resources');
  process.exit(0);
});
How to run
Install the packages:
npm install sherpa-onnx-node
npm install node-cpal
Download the VAD model:
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
Set the library path and run:
# macOS
export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH
# Linux
export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH
node vad_microphone.js
Speak into the microphone. Detected speech segments will be printed and saved as WAV files. Press Ctrl+C to stop.
Expected output
Started! Please speak
0: Detected speech
0 End of speech. Duration: 2.345 seconds
Saved to 0-14-30-25.wav
Notes
- `node-cpal` provides cross-platform microphone access via Rust’s CPAL library. Install it with `npm install node-cpal`.
- `cpal.getDefaultInputDevice()` returns the default microphone device object. Use `device.deviceId` to get the device ID string.
- `cpal.getDefaultInputConfig(deviceId)` returns the device’s native sample rate and format.
- `cpal.createStream(deviceId, true, config, callback)` opens an input stream. The callback receives a `Float32Array` of audio samples.
- `new sherpa_onnx.LinearResampler(inputRate, outputRate)` creates a resampler to convert audio from the device’s native sample rate to the rate required by the model (e.g., 16 kHz).
- `resampler.resample(samples)` resamples a `Float32Array` chunk and returns the resampled `Float32Array`.
- `CircularBuffer` stores incoming microphone samples. The VAD processes audio in fixed-size windows (512 samples for Silero VAD at 16 kHz).
- `isDetected()` returns `true` when speech is currently being detected.
- `isEmpty()` / `front()` / `pop()` are used to extract completed speech segments (speech followed by enough silence).
- You can also use `ten-vad.onnx` instead of `silero_vad.onnx`.