Audio Tagging

Classify audio events in WAV files using a CED (Consistent Ensemble Distillation) model. The model identifies sounds such as speech, music, and animal calls.

Source file

nodejs-addon-examples/test_audio_tagging_ced.js

Code

 1// Copyright (c)  2024  Xiaomi Corporation
 2//
 3// Audio tagging: classify audio events in WAV files using a CED model.
 4//
 5// Usage:
 6//   node test_audio_tagging_ced.js
 7//
 8const sherpa_onnx = require('sherpa-onnx-node');
 9
10// Download models from
11// https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
12function createAudioTagging() {  // builds an AudioTagging instance from a hard-coded CED model config
13  const config = {
14    model: {
15      ced: './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/model.int8.onnx',  // path to the CED model file
16      numThreads: 1,  // presumably the number of inference threads — confirm in the sherpa-onnx docs
17      debug: true,  // presumably enables verbose diagnostic output — confirm in the sherpa-onnx docs
18    },
19    labels:  // path to the class-label CSV that ships with the model
20        './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/class_labels_indices.csv',
21    topK: 5,  // return the 5 most probable audio events
22  };
23  return new sherpa_onnx.AudioTagging(config);  // config is handed unchanged to the addon's constructor
24}
25
26const at = createAudioTagging();  // one tagger instance, reused for every file below
27
28const testWaves = [
29  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/1.wav',
30  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/2.wav',
31  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/3.wav',
32];
33
34console.log('------');
35
36for (let filename of testWaves) {
37  const start = Date.now();  // timer starts before stream creation and WAV loading, so both count toward the elapsed time
38  const stream = at.createStream();  // a fresh stream is created for each file
39  const wave = sherpa_onnx.readWave(filename);
40  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
41
42  // compute() returns a collection of events; each one is read below through its prob and name fields.
43  const events = at.compute(stream);
44  const stop = Date.now();
45
46  const elapsed_seconds = (stop - start) / 1000;  // Date.now() reports milliseconds
47  const duration = wave.samples.length / wave.sampleRate;  // sample count / sample rate; assumes samples holds a single channel — TODO confirm
48  const real_time_factor = elapsed_seconds / duration;  // below 1 means processing took less time than the audio lasts
49
50  console.log('input file:', filename);
51  console.log('Probability\t\tName');
52  for (let e of events) {
53    console.log(`${e.prob.toFixed(3)}\t\t\t${e.name}`);
54  }
55  console.log('Wave duration', duration.toFixed(3), 'seconds');
56  console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
57  console.log(
58      `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
59      real_time_factor.toFixed(3));
60  console.log('------');  // separator line closing this file's report
61}

How to run

  1. Install the package:

    npm install sherpa-onnx-node
    
  2. Download the model and test files:

    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
    tar xvf sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
    rm sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
    
  3. Set the library path and run:

    # macOS
    export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH
    
    # Linux
    export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH
    
    node test_audio_tagging_ced.js
    

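Expected output (one block like the following is printed for each of the three test files; timings vary by machine)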

------
input file: ./sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/1.wav
Probability          Name
0.987                        Speech
0.654                        Narration, monologue
0.321                        Conversation
0.123                        Inside, small room
0.045                        Telephone
Wave duration 3.200 seconds
Elapsed 0.045 seconds
RTF = 0.045/3.200 = 0.014
------

Notes

  • AudioTagging supports two model types: ced and zipformer.

  • topK controls how many events are returned (default 5).

  • Each event has prob (probability) and name (event label).

  • The labels file (class_labels_indices.csv) maps model output indices to human-readable event names.