Audio Tagging

Classify audio events in WAV files using a CED (Consistent Ensemble Distillation) model. The model identifies sounds such as speech, music, and animal calls.

Source file

nodejs-addon-examples/test_audio_tagging_ced.js

Code

// Copyright (c)  2024  Xiaomi Corporation
//
// Audio tagging: classify audio events in WAV files using a CED model.
//
// Usage:
//   node audio_tagging.js
//
const sherpa_onnx = require('sherpa-onnx-node');

// Download models from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
/**
 * Builds the audio tagger used by this example.
 *
 * The configuration points at the int8 CED model file and its labels CSV,
 * uses one thread with the debug flag set, and requests the top-5 most
 * probable audio events.
 *
 * @returns {object} A new `sherpa_onnx.AudioTagging` instance.
 */
function createAudioTagging() {
  // Model-level settings: which ONNX file to load and how to run it.
  const model = {
    ced: './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/model.int8.onnx',
    numThreads: 1,
    debug: true,
  };

  const labels =
      './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/class_labels_indices.csv';

  // Return the top-5 most probable audio events.
  const topK = 5;

  return new sherpa_onnx.AudioTagging({model, labels, topK});
}

// A single tagger instance is reused for every input file.
const at = createAudioTagging();

// Sample recordings located in the model directory's test_wavs folder.
const testWaves = [
  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/1.wav',
  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/2.wav',
  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/3.wav',
];

// Printed before the first report and after each one.
const separator = '------';
console.log(separator);

for (const wavPath of testWaves) {
  // The timed region covers stream creation, reading the file, feeding the
  // samples, and the compute() call.
  const startMs = Date.now();
  const stream = at.createStream();
  const wave = sherpa_onnx.readWave(wavPath);
  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

  // The result is iterated below; each entry's prob and name are printed.
  const events = at.compute(stream);
  const stopMs = Date.now();

  const elapsedSeconds = (stopMs - startMs) / 1000;
  const durationSeconds = wave.samples.length / wave.sampleRate;
  // Real-time factor: processing time divided by audio duration.
  const realTimeFactor = elapsedSeconds / durationSeconds;

  // Format once; both values appear in two of the lines printed below.
  const elapsedText = elapsedSeconds.toFixed(3);
  const durationText = durationSeconds.toFixed(3);

  console.log('input file:', wavPath);
  console.log('Probability\t\tName');
  for (const {prob, name} of events) {
    console.log(`${prob.toFixed(3)}\t\t\t${name}`);
  }
  console.log('Wave duration', durationText, 'seconds');
  console.log('Elapsed', elapsedText, 'seconds');
  console.log(
      `RTF = ${elapsedText}/${durationText} =`, realTimeFactor.toFixed(3));
  console.log(separator);
}

How to run

Install the package:
```
npm install sherpa-onnx-node
```

Download the model and test files:

```
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
tar xvf sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
rm sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
```

Save the code above as audio_tagging.js, then set the library path and run:

```
# macOS
export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH

# Linux
export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH

node audio_tagging.js
```

Expected output

```
------
input file: ./sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/1.wav
Probability          Name
0.987                        Speech
0.654                        Narration, monologue
0.321                        Conversation
0.123                        Inside, small room
0.045                        Telephone
Wave duration 3.200 seconds
Elapsed 0.045 seconds
RTF = 0.045/3.200 = 0.014
------
```

Notes

- `AudioTagging` supports two model types: `ced` and `zipformer`.
- `topK` controls how many events are returned (default 5).
- Each event has a `prob` (probability) and a `name` (event label).
- The labels file (`class_labels_indices.csv`) maps model output indices to human-readable event names.