Speaker Identification

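Identify and verify speakers using speaker embeddings. This example enrolls two speakers, identifies three test utterances (one of them from a speaker who was never enrolled), verifies a specific speaker, and finally removes that speaker to show that verification then fails.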

Source file

nodejs-addon-examples/test_speaker_identification.js

Code

  // Copyright (c)  2024  Xiaomi Corporation
  //
  // Speaker identification and verification using speaker embeddings.
  //
  // Usage:
  //   node speaker_identification.js
  //
  const sherpa_onnx = require('sherpa-onnx-node');
  const assert = require('node:assert');

  // Download the embedding model from
  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  /**
   * Build the speaker embedding extractor used throughout this example.
   *
   * The model path is relative to the current working directory, so the
   * .onnx file must be downloaded there before running the script.
   *
   * @returns {object} A new `sherpa_onnx.SpeakerEmbeddingExtractor`.
   */
  function createSpeakerEmbeddingExtractor() {
    const config = {
      model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
      numThreads: 1,
      debug: true,
    };
    return new sherpa_onnx.SpeakerEmbeddingExtractor(config);
  }

  /**
   * Helper: read a WAV file and compute its speaker embedding.
   *
   * A fresh stream is created for every file so that samples from different
   * utterances are never mixed.
   *
   * @param {object} extractor - Extractor from createSpeakerEmbeddingExtractor().
   * @param {string} filename - Path of the WAV file to read.
   * @returns {*} Whatever `extractor.compute()` returns for the stream
   *     (the embedding; presumably a Float32Array of length `extractor.dim`).
   */
  function computeEmbedding(extractor, filename) {
    const stream = extractor.createStream();
    const wave = sherpa_onnx.readWave(filename);
    stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
    return extractor.compute(stream);
  }

  const extractor = createSpeakerEmbeddingExtractor();

  // SpeakerEmbeddingManager stores speaker embeddings and supports
  // search, verify, add, and remove operations.
  const manager = new sherpa_onnx.SpeakerEmbeddingManager(extractor.dim);

  // --- Enroll speakers ---
  // Download test files from https://github.com/csukuangfj/sr-data
  const spk1Files = [
    './sr-data/enroll/fangjun-sr-1.wav',
    './sr-data/enroll/fangjun-sr-2.wav',
    './sr-data/enroll/fangjun-sr-3.wav',
  ];

  // One embedding per enrollment utterance.
  const spk1Vec = spk1Files.map((f) => computeEmbedding(extractor, f));

  const spk2Files = [
    './sr-data/enroll/leijun-sr-1.wav',
    './sr-data/enroll/leijun-sr-2.wav',
  ];

  const spk2Vec = spk2Files.map((f) => computeEmbedding(extractor, f));

  // addMulti() registers a speaker with multiple enrollment utterances.
  let ok = manager.addMulti({name: 'fangjun', v: spk1Vec});
  assert.equal(ok, true);

  ok = manager.addMulti({name: 'leijun', v: spk2Vec});
  assert.equal(ok, true);

  assert.equal(manager.getNumSpeakers(), 2);
  assert.equal(manager.contains('fangjun'), true);
  assert.equal(manager.contains('leijun'), true);

  console.log('--- All speakers ---');
  console.log(manager.getAllSpeakerNames());
  console.log('--------------------');

  // --- Identify test utterances ---
  const testFiles = [
    './sr-data/test/fangjun-test-sr-1.wav',
    './sr-data/test/leijun-test-sr-1.wav',
    './sr-data/test/liudehua-test-sr-1.wav',
  ];

  const threshold = 0.6;

  for (const f of testFiles) {
    const embedding = computeEmbedding(extractor, f);

    // search() returns the speaker name, or '' if no match above threshold.
    let name = manager.search({v: embedding, threshold: threshold});
    if (name === '') {
      name = '<Unknown>';
    }
    console.log(`${f}: ${name}`);
  }

  // --- Verify a specific speaker ---
  ok = manager.verify({
    name: 'fangjun',
    v: computeEmbedding(extractor, testFiles[0]),
    threshold: threshold
  });
  assert.equal(ok, true);

  // --- Remove a speaker ---
  ok = manager.remove('fangjun');
  assert.equal(ok, true);

  // Once 'fangjun' has been removed, verifying the same utterance must fail.
  ok = manager.verify({
    name: 'fangjun',
    v: computeEmbedding(extractor, testFiles[0]),
    threshold: threshold
  });
  assert.equal(ok, false);

  assert.equal(manager.getNumSpeakers(), 1);

How to run

  1. Install the package:

    npm install sherpa-onnx-node
    
  2. Download the embedding model:

    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
    
  3. Download test audio files:

    git clone https://github.com/csukuangfj/sr-data
    
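  4. Save the code above as speaker_identification.js (in the repository it is named test_speaker_identification.js), set the library path, and run it from the directory that contains the model and the sr-data folder: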

    # macOS
    export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH
    
    # Linux
    export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH
    
    node speaker_identification.js
    

Expected output

--- All speakers ---
[ 'fangjun', 'leijun' ]
--------------------
./sr-data/test/fangjun-test-sr-1.wav: fangjun
./sr-data/test/leijun-test-sr-1.wav: leijun
./sr-data/test/liudehua-test-sr-1.wav: <Unknown>

Notes

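  • SpeakerEmbeddingExtractor computes a fixed-dimensional embedding vector (of size extractor.dim) from the audio samples fed to a stream; in this example the samples come from a WAV file read with readWave().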

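  • SpeakerEmbeddingManager stores embeddings. Besides remove(name), contains(name), getNumSpeakers(), and getAllSpeakerNames(), which the example also uses, its main operations are: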

    • search({v, threshold}): Find the best matching speaker, or return '' if none match above the threshold.

    • verify({name, v, threshold}): Check if the embedding matches a specific enrolled speaker.

    • addMulti({name, v}): Enroll a speaker with multiple utterances for better accuracy.

  • The threshold controls the trade-off between false accepts and false rejects. Tune it for your use case.