Keyword Spotting

Detect predefined keywords in audio using a streaming Zipformer transducer model. The model listens for specific words or phrases and reports when they are detected.

Source file

nodejs-addon-examples/test_keyword_spotter_transducer.js

Code

 1// Copyright (c)  2024  Xiaomi Corporation
 2//
 3// Keyword spotting: detect predefined keywords in audio using a
 4// streaming Zipformer transducer model.
 5//
 6// Usage:
 7//   node keyword_spotter.js
 8//
 9const sherpa_onnx = require('sherpa-onnx-node');
10
11// Download models from
12// https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models
13const config = {
14  'featConfig': {
15    'sampleRate': 16000,
16    'featureDim': 80,
17  },
18  'modelConfig': {
19    'transducer': {
20      'encoder':
21          './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx',
22      'decoder':
23          './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx',
24      'joiner':
25          './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx',
26    },
27    'tokens':
28        './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt',
29    'numThreads': 1,
30    'provider': 'cpu',
31    'debug': 1,
32  },
33  'keywordsFile':
34      './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt',
35};
36
37const waveFilename =
38    './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav';
39
40const kws = new sherpa_onnx.KeywordSpotter(config);
41const stream = kws.createStream();
42
43const wave = sherpa_onnx.readWave(waveFilename);
44stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
45
46// Append tail padding.
47const tailPadding = new Float32Array(wave.sampleRate * 0.4);
48stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});
49
50// Decode and collect detected keywords.
51const detectedKeywords = [];
52let start = Date.now();
53while (kws.isReady(stream)) {
54  kws.decode(stream);
55  const keyword = kws.getResult(stream).keyword;
56  if (keyword != '') {
57    detectedKeywords.push(keyword);
58  }
59}
60let stop = Date.now();
61
62const elapsed_seconds = (stop - start) / 1000;
63const duration = wave.samples.length / wave.sampleRate;
64const real_time_factor = elapsed_seconds / duration;
65console.log('Wave duration', duration.toFixed(3), 'seconds');
66console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
67console.log(
68    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
69    real_time_factor.toFixed(3));
70console.log(waveFilename);
71console.log('Detected keywords:', detectedKeywords);

How to run

  1. Install the package:

    npm install sherpa-onnx-node
    
  2. Download the model and test files:

    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
    tar xvf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
    rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
    
  3. Save the code above as keyword_spotter.js, then set the library path and run:

    # macOS
    export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH
    
    # Linux
    export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH
    
    node keyword_spotter.js
    

Expected output

Wave duration 3.456 seconds
Elapsed 0.123 seconds
RTF = 0.123/3.456 = 0.036
./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav
Detected keywords: [ '你好', '小爱同学' ]

Notes

  • The keywordsFile in the config specifies a text file containing the keywords to detect, one per line. You can edit this file to change the keywords.

  • Like streaming ASR, the keyword spotter uses a loop of isReady() and decode() calls, and checks getResult().keyword after each decode.

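  • The example appends 0.4 seconds of tail padding (a zero-filled buffer, i.e. silence) after the main audio, so that the streaming model receives enough trailing audio to finish decoding the end of the recording.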

  • An empty keyword string means no keyword was detected in that decode step.