Speaker Identification
Identify and verify speakers using speaker embeddings. This example enrolls two speakers, then identifies test utterances and verifies a specific speaker.
Source file
Code
1// Copyright (c) 2024 Xiaomi Corporation
2//
3// Speaker identification and verification using speaker embeddings.
4//
5// Usage:
6// node speaker_identification.js
7//
8const sherpa_onnx = require('sherpa-onnx-node');
9const assert = require('node:assert');
10
// Download the embedding model from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models

/**
 * Create the speaker embedding extractor used by this example.
 *
 * @param {string} [model] - Path to the ONNX speaker embedding model.
 *     Defaults to the 3D-Speaker ERes2Net model file expected in the
 *     current working directory.
 * @returns {object} A new `sherpa_onnx.SpeakerEmbeddingExtractor` built from
 *     the configuration below.
 */
function createSpeakerEmbeddingExtractor(
    model = './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx') {
  const config = {
    model,
    numThreads: 1,
    debug: true,
  };
  return new sherpa_onnx.SpeakerEmbeddingExtractor(config);
}
21
/**
 * Read a WAV file and compute its speaker embedding.
 *
 * @param {object} extractor - Extractor exposing `createStream()` and
 *     `compute(stream)` (here: the object returned by
 *     createSpeakerEmbeddingExtractor()).
 * @param {string} filename - Path of the WAV file to read.
 * @returns {*} Whatever `extractor.compute(stream)` returns for the audio
 *     (the speaker embedding).
 */
function computeEmbedding(extractor, filename) {
  const stream = extractor.createStream();
  const {sampleRate, samples} = sherpa_onnx.readWave(filename);
  // Feed the whole file in one call; the extractor then computes the
  // embedding from everything the stream has accepted.
  stream.acceptWaveform({sampleRate, samples});
  return extractor.compute(stream);
}
29
const extractor = createSpeakerEmbeddingExtractor();

// SpeakerEmbeddingManager stores speaker embeddings and supports
// search, verify, add, and remove operations. It is constructed with the
// extractor's embedding dimension.
const manager = new sherpa_onnx.SpeakerEmbeddingManager(extractor.dim);

// --- Enroll speakers ---
// Download test files from https://github.com/csukuangfj/sr-data
const spk1Files = [
  './sr-data/enroll/fangjun-sr-1.wav',
  './sr-data/enroll/fangjun-sr-2.wav',
  './sr-data/enroll/fangjun-sr-3.wav',
];
const spk1Vec = spk1Files.map((f) => computeEmbedding(extractor, f));

const spk2Files = [
  './sr-data/enroll/leijun-sr-1.wav',
  './sr-data/enroll/leijun-sr-2.wav',
];
const spk2Vec = spk2Files.map((f) => computeEmbedding(extractor, f));

// addMulti() registers a speaker with multiple enrollment utterances.
const addedFangjun = manager.addMulti({name: 'fangjun', v: spk1Vec});
assert.equal(addedFangjun, true);

const addedLeijun = manager.addMulti({name: 'leijun', v: spk2Vec});
assert.equal(addedLeijun, true);

assert.equal(manager.getNumSpeakers(), 2);
assert.equal(manager.contains('fangjun'), true);
assert.equal(manager.contains('leijun'), true);

console.log('--- All speakers ---');
console.log(manager.getAllSpeakerNames());
console.log('--------------------');

// --- Identify test utterances ---
const testFiles = [
  './sr-data/test/fangjun-test-sr-1.wav',
  './sr-data/test/leijun-test-sr-1.wav',
  './sr-data/test/liudehua-test-sr-1.wav',
];

// Score threshold passed to search() and verify() below.
const threshold = 0.6;

for (const f of testFiles) {
  const embedding = computeEmbedding(extractor, f);

  // search() returns the speaker name, or '' if no match above threshold.
  const match = manager.search({v: embedding, threshold});
  const name = match === '' ? '<Unknown>' : match;
  console.log(`${f}: ${name}`);
}

// --- Verify a specific speaker ---
// testFiles[0] is the fangjun test utterance, so verification must succeed
// while 'fangjun' is enrolled.
const verifiedWhileEnrolled = manager.verify({
  name: 'fangjun',
  v: computeEmbedding(extractor, testFiles[0]),
  threshold,
});
assert.equal(verifiedWhileEnrolled, true);

// --- Remove a speaker ---
const removed = manager.remove('fangjun');
assert.equal(removed, true);

// The same utterance must no longer verify once the speaker is removed.
const verifiedAfterRemoval = manager.verify({
  name: 'fangjun',
  v: computeEmbedding(extractor, testFiles[0]),
  threshold,
});
assert.equal(verifiedAfterRemoval, false);

assert.equal(manager.getNumSpeakers(), 1);
How to run
Install the package:
npm install sherpa-onnx-node
Download the embedding model:
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
Download test audio files:
git clone https://github.com/csukuangfj/sr-data
Set the library path and run:
# macOS
export DYLD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$DYLD_LIBRARY_PATH

# Linux
export LD_LIBRARY_PATH=$(npm root)/sherpa-onnx-node/lib:$LD_LIBRARY_PATH

node speaker_identification.js
Expected output
--- All speakers ---
[ 'fangjun', 'leijun' ]
--------------------
./sr-data/test/fangjun-test-sr-1.wav: fangjun
./sr-data/test/leijun-test-sr-1.wav: leijun
./sr-data/test/liudehua-test-sr-1.wav: <Unknown>
Notes
`SpeakerEmbeddingExtractor` computes a fixed-dimensional embedding vector from a WAV file. `SpeakerEmbeddingManager` stores embeddings and provides, among others, these three operations:
- `search({v, threshold})`: Find the best matching speaker, or return `''` if none match above the threshold.
- `verify({name, v, threshold})`: Check if the embedding matches a specific enrolled speaker.
- `addMulti({name, v})`: Enroll a speaker with multiple utterances for better accuracy.
The `threshold` controls the trade-off between false accepts and false rejects. Tune it for your use case.