generated from sensein/python-package-template
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #92 from sensein/release_060
release 0.6.0 is for adding speaker verification as a task and fixing some bugs and adjusting docs building issues
- Loading branch information
Showing
11 changed files
with
696 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/Users/isaacbevers/sensein/senselab-wrapper/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Verifies whether two audio segments belong to the same speaker.""" |
61 changes: 61 additions & 0 deletions
61
src/senselab/audio/tasks/speaker_verification/speaker_verification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
"""Speaker Verification Module. | ||
This module provides a function for verifying whether pairs of audio samples | ||
come from the same speaker, by comparing speaker embeddings extracted with a | ||
specified model. | ||
""" | ||
|
||
from typing import List, Optional, Tuple | ||
|
||
from torch.nn.functional import cosine_similarity | ||
|
||
from senselab.audio.data_structures.audio import Audio | ||
from senselab.audio.tasks.speaker_embeddings.speechbrain import SpeechBrainEmbeddings | ||
from senselab.utils.data_structures.device import DeviceType, _select_device_and_dtype | ||
from senselab.utils.data_structures.model import SpeechBrainModel | ||
|
||
TRAINING_SAMPLE_RATE = 16000 # spkrec-ecapa-voxceleb trained on 16kHz audio | ||
|
||
|
||
def verify_speaker(
    audios: List[Tuple[Audio, Audio]],
    model: SpeechBrainModel = SpeechBrainModel(path_or_uri="speechbrain/spkrec-ecapa-voxceleb", revision="main"),
    device: Optional[DeviceType] = None,
    threshold: float = 0.25,
) -> List[Tuple[float, bool]]:
    """Verify whether each pair of audio samples comes from the same speaker.

    For every pair, speaker embeddings are extracted for both audios, their
    cosine similarity is computed, and the pair is predicted to be the same
    speaker when the similarity is strictly greater than ``threshold``.

    Args:
        audios (List[Tuple[Audio, Audio]]): A list of tuples, where each tuple contains
                                            two audio samples to be compared.
        model (SpeechBrainModel, optional): The model used to extract speaker embeddings.
        device (DeviceType, optional): The device to run the model on. When ``None``,
                                       a device is chosen among CPU and CUDA by
                                       ``_select_device_and_dtype``.
        threshold (float, optional): The similarity score above which two samples are
                                     predicted to come from the same speaker.

    Returns:
        List[Tuple[float, bool]]: One ``(score, prediction)`` tuple per input pair, in
                                  input order. ``score`` is the cosine similarity of the
                                  two embeddings and ``prediction`` is ``score > threshold``.

    Raises:
        ValueError: If either audio of a pair is not sampled at ``TRAINING_SAMPLE_RATE``.
    """
    # Auto-select only when the caller expressed no preference; selecting
    # unconditionally would silently discard the `device` argument.
    # NOTE(review): an explicitly passed device is forwarded as-is; confirm the
    # embedding extractor rejects devices it does not support.
    if device is None:
        device = _select_device_and_dtype(compatible_devices=[DeviceType.CPU, DeviceType.CUDA])[0]

    scores_and_predictions: List[Tuple[float, bool]] = []
    for audio1, audio2 in audios:
        # Both members of the pair must match the sample rate the model expects.
        for label, audio in (("audio1", audio1), ("audio2", audio2)):
            if audio.sampling_rate != TRAINING_SAMPLE_RATE:
                # Adjacent literals instead of a backslash continuation, so the
                # source indentation does not leak into the message.
                raise ValueError(
                    f"{model.path_or_uri} trained on {TRAINING_SAMPLE_RATE} "
                    f"sample audio, but {label} has sample rate {audio.sampling_rate}."
                )

        embeddings = SpeechBrainEmbeddings.extract_speechbrain_speaker_embeddings_from_audios(
            audios=[audio1, audio2], model=model, device=device
        )
        embedding1, embedding2 = embeddings
        # Add a leading batch dimension so cosine_similarity compares the two
        # embeddings along its default dim=1.
        similarity = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0))
        score = similarity.mean().item()
        prediction = score > threshold
        scores_and_predictions.append((score, prediction))
    return scores_and_predictions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
"""Test Module for Speaker Verification. | ||
This module contains a minimal test to ensure the speaker verification function does not fail. | ||
Tests: | ||
- test_verify_speaker: Tests the verify_speaker function. | ||
""" | ||
|
||
import os | ||
|
||
import pytest | ||
|
||
from senselab.audio.data_structures.audio import Audio | ||
from senselab.audio.tasks.preprocessing.preprocessing import resample_audios | ||
from senselab.audio.tasks.speaker_verification.speaker_verification import ( | ||
verify_speaker, | ||
) | ||
|
||
# Mark the test as skipped on GitHub Actions instead of defining it
# conditionally, so it still shows up (as skipped) in the test report.
@pytest.mark.skipif(
    os.getenv("GITHUB_ACTIONS") == "true",
    reason="test_verify_speaker is not run on GitHub Actions",
)
@pytest.mark.large_model
def test_verify_speaker(mono_audio_sample: Audio) -> None:
    """Tests the verify_speaker function to ensure it does not fail.

    The sample is resampled to 16 kHz, paired with itself three times, and
    passed to verify_speaker; each result must be a (float, bool) tuple.

    Args:
        mono_audio_sample (Audio): The mono audio sample to use for testing.

    Returns:
        None
    """
    mono_audio_sample = resample_audios([mono_audio_sample], 16000)[0]
    assert mono_audio_sample.sampling_rate == 16000
    mono_audio_samples = [(mono_audio_sample, mono_audio_sample)] * 3
    scores_and_predictions = verify_speaker(mono_audio_samples)
    # One result per input pair, and every result has the documented shape.
    assert len(scores_and_predictions) == len(mono_audio_samples)
    for result in scores_and_predictions:
        assert len(result) == 2
        score, prediction = result
        assert isinstance(score, float)
        assert isinstance(prediction, bool)
Oops, something went wrong.