Skip to content

Commit

Permalink
Merge pull request #92 from sensein/release_060
Browse files Browse the repository at this point in the history
release 0.6.0 is for adding speaker verification as a task and fixing some bugs and adjusting docs building issues
  • Loading branch information
fabiocat93 authored Jul 11, 2024
2 parents e0b8cd7 + b26be06 commit 75bef61
Show file tree
Hide file tree
Showing 11 changed files with 696 additions and 33 deletions.
25 changes: 22 additions & 3 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,34 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
- name: Install ffmpeg (Ubuntu)
if: startsWith(matrix.os, 'ubuntu')
run: sudo apt-get update && sudo apt-get install -y ffmpeg
- name: Install ffmpeg (macOS)
if: startsWith(matrix.os, 'macos')
run: brew install ffmpeg
- name: Install ffmpeg (Windows)
if: startsWith(matrix.os, 'windows')
run: choco install ffmpeg
- name: Install pipx and ensure it's up to date
run: |
python -m pip install --upgrade pipx
pipx ensurepath
shell: bash
- name: Install poetry
run: pipx install poetry==1.7.1
shell: bash
- name: Install dependencies with Poetry
run: |
python -m pip install poetry==1.7.1
poetry run pip install iso-639
poetry install --with dev,docs
shell: bash
- name: Build docs
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
APP_MODULE_NAME=$(ls src -U | head -1)
APP_MODULE_NAME=$(ls -1 src | sort | head -1)
echo "APP_MODULE_NAME: $APP_MODULE_NAME"
poetry run pdoc src/"$APP_MODULE_NAME" -o docs -t docs_style/pdoc-theme --docformat google
touch docs/.nojekyll
shell: bash
Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ instance/
.scrapy

# Sphinx documentation
docs/_build/
# docs/_build/

# PyBuilder
.pybuilder/
Expand Down Expand Up @@ -167,6 +167,8 @@ cython_debug/
.idea/

data/
# pdoc documentation
docs/

# Speechbrain models
pretrained_models
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ repos:
rev: v2.3.0
hooks:
- id: codespell
args: [--skip=*.ipynb]
additional_dependencies:
- tomli

- repo: https://github.com/hija/clean-dotenv
rev: v0.0.7
hooks:
Expand Down
1 change: 1 addition & 0 deletions audio_48khz_mono_16bits.wav
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ pattern = "default-unprefixed"
[tool.codespell]
skip = [
"poetry.lock",
"docs_style/pdoc-theme/syntax-highlighting.css"
"docs_style/pdoc-theme/syntax-highlighting.css",
"*.ipynb"
]
ignore-words-list = ["senselab", "nd", "astroid", "wil", "SER"]

Expand Down
43 changes: 29 additions & 14 deletions src/senselab/audio/tasks/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,49 @@
"""This module implements some utilities for the preprocessing task."""

from typing import List, Tuple
from typing import List, Optional, Tuple

import pydra
import torchaudio.functional as F
import torch
from scipy import signal
from speechbrain.augment.time_domain import Resample

from senselab.audio.data_structures.audio import Audio


def resample_audios(audios: List[Audio], resample_rate: int, rolloff: float = 0.99) -> List[Audio]:
"""Resamples all Audios to a given sampling rate.
Takes a list of audios and resamples each into the new sampling rate. Notably does not assume any
specific structure of the audios (can vary in stereo vs. mono as well as their original sampling rate)
def resample_audios(
audios: List[Audio],
resample_rate: int,
lowcut: Optional[float] = None,
order: int = 4,
) -> List[Audio]:
"""Resamples a list of audio signals to a given sampling rate.
Args:
audios: List of Audios to resample
resample_rate: Rate at which to resample the Audio
rolloff: The roll-off frequency of the filter, as a fraction of the Nyquist.
Lower values reduce anti-aliasing, but also reduce some of the highest frequencies
audios (List[Audio]): List of audio objects to resample.
resample_rate (int): Target sampling rate.
lowcut (float, optional): Low cut frequency for IIR filter.
order (int, optional): Order of the IIR filter. Defaults to 4.
Returns:
List of Audios that have all been resampled to the given resampling rate
List[Audio]: Resampled audio objects.
"""
resampled_audios = []
for audio in audios:
resampled = F.resample(audio.waveform, audio.sampling_rate, resample_rate, rolloff=rolloff)
if lowcut is None:
lowcut = resample_rate / 2 - 100
sos = signal.butter(order, lowcut, btype="low", output="sos", fs=resample_rate)

channels = []
for channel in audio.waveform:
filtered_channel = torch.from_numpy(signal.sosfiltfilt(sos, channel.numpy()).copy()).float()
resampler = Resample(orig_freq=audio.sampling_rate, new_freq=resample_rate)
resampled_channel = resampler(filtered_channel.unsqueeze(0)).squeeze(0)
channels.append(resampled_channel)

resampled_waveform = torch.stack(channels)
resampled_audios.append(
Audio(
waveform=resampled,
waveform=resampled_waveform,
sampling_rate=resample_rate,
metadata=audio.metadata.copy(),
orig_path_or_id=audio.orig_path_or_id,
Expand Down
1 change: 1 addition & 0 deletions src/senselab/audio/tasks/speaker_verification/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Verifies whether two audio segments belong to the same speaker."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Audio Processing and Speaker Verification Module.
This module provides a function for verifying whether pairs of audio samples
come from the same speaker, using a specified SpeechBrain model.
"""

from typing import List, Optional, Tuple

from torch.nn.functional import cosine_similarity

from senselab.audio.data_structures.audio import Audio
from senselab.audio.tasks.speaker_embeddings.speechbrain import SpeechBrainEmbeddings
from senselab.utils.data_structures.device import DeviceType, _select_device_and_dtype
from senselab.utils.data_structures.model import SpeechBrainModel

# Sampling rate (Hz) that verify_speaker requires of every input audio.
# The default model, spkrec-ecapa-voxceleb, was trained on 16kHz audio.
TRAINING_SAMPLE_RATE = 16000


def verify_speaker(
    audios: List[Tuple[Audio, Audio]],
    model: SpeechBrainModel = SpeechBrainModel(path_or_uri="speechbrain/spkrec-ecapa-voxceleb", revision="main"),
    device: Optional[DeviceType] = None,
    threshold: float = 0.25,
) -> List[Tuple[float, bool]]:
    """Verifies if two audio samples are from the same speaker.

    Args:
        audios (List[Tuple[Audio, Audio]]): A list of tuples, where each tuple contains
            two audio samples to be compared.
        model (SpeechBrainModel, optional): The model for speaker verification.
        device (DeviceType, optional): The device to run the model on. When None, a
            device is chosen by `_select_device_and_dtype` among CPU and CUDA.
        threshold (float, optional): The threshold to determine same speaker. A pair
            is predicted as the same speaker when its score is strictly greater
            than this value.

    Returns:
        List[Tuple[float, bool]]: A list of tuples containing the verification score and
            the prediction for each pair of audio samples. The verification score is
            the cosine similarity between the two speaker embeddings, and the
            prediction is a boolean indicating if the two samples are from the
            same speaker.

    Raises:
        ValueError: If either audio of a pair does not have a sampling rate of
            TRAINING_SAMPLE_RATE.
    """
    # Honor an explicitly requested device; only auto-select when none was given.
    # (Previously the argument was overwritten unconditionally and thus ignored.)
    if device is None:
        device = _select_device_and_dtype(compatible_devices=[DeviceType.CPU, DeviceType.CUDA])[0]

    scores_and_predictions = []
    for audio1, audio2 in audios:
        # Check audio1 first, then audio2, so the first offending sample is reported.
        for label, audio in (("audio1", audio1), ("audio2", audio2)):
            if audio.sampling_rate != TRAINING_SAMPLE_RATE:
                # Adjacent f-strings (instead of a backslash continuation inside the
                # literal) keep source indentation out of the error message.
                raise ValueError(
                    f"{model.path_or_uri} trained on {TRAINING_SAMPLE_RATE} "
                    f"sample audio, but {label} has sample rate {audio.sampling_rate}."
                )

        embeddings = SpeechBrainEmbeddings.extract_speechbrain_speaker_embeddings_from_audios(
            audios=[audio1, audio2], model=model, device=device
        )
        embedding1, embedding2 = embeddings
        # unsqueeze(0) adds a leading dimension to each embedding before comparison.
        similarity = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0))
        score = similarity.mean().item()
        prediction = score > threshold
        scores_and_predictions.append((score, prediction))
    return scores_and_predictions
24 changes: 11 additions & 13 deletions src/senselab/audio/tasks/speech_enhancement/speechbrain.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This module provides the Speechbrain interface for speech enhancement."""

from typing import Dict, List, Optional

import torch
Expand Down Expand Up @@ -35,19 +36,16 @@ def _get_speechbrain_model(
)
key = f"{model.path_or_uri}-{model.revision}-{device.value}"
if key not in cls._models:
cls._models[key] = separator.from_hparams(
source=model.path_or_uri,
run_opts={"device": device.value}
)
cls._models[key] = separator.from_hparams(source=model.path_or_uri, run_opts={"device": device.value})
return cls._models[key]

@classmethod
def enhance_audios_with_speechbrain(
cls,
audios: List[Audio],
model: SpeechBrainModel = SpeechBrainModel(
path_or_uri="speechbrain/sepformer-wham16k-enhancement",
revision="main"),
path_or_uri="speechbrain/sepformer-wham16k-enhancement", revision="main"
),
device: Optional[DeviceType] = None,
) -> List[Audio]:
"""Enhances all audio samples in the dataset.
Expand All @@ -59,7 +57,7 @@ def enhance_audios_with_speechbrain(
Returns:
List[Audio]: The list of enhanced audio objects.
Todo:
- Optimizing the computation by working in batches
- Double-checking the input size of enhancer.encode_batch
Expand All @@ -70,13 +68,13 @@ def enhance_audios_with_speechbrain(
# Check that all audio objects have the correct sampling rate
for audio in audios:
if audio.waveform.shape[0] != 1:
raise ValueError(
f"Audio waveform must be mono (1 channel), but got {audio.waveform.shape[0]} channels"
)
raise ValueError(f"Audio waveform must be mono (1 channel), but got {audio.waveform.shape[0]} channels")
if audio.sampling_rate != expected_sample_rate:
raise ValueError(
"Audio sampling rate " + str(audio.sampling_rate) +
" does not match expected " + str(expected_sample_rate)
"Audio sampling rate "
+ str(audio.sampling_rate)
+ " does not match expected "
+ str(expected_sample_rate)
)

# Stack audio waveforms for batch processing
Expand All @@ -87,6 +85,6 @@ def enhance_audios_with_speechbrain(

# Update the original audio objects with the enhanced waveforms
for audio, enhanced_waveform in zip(audios, enhanced_waveforms):
audio.waveform = enhanced_waveform
audio.waveform = enhanced_waveform.reshape(1, -1)

return audios
40 changes: 40 additions & 0 deletions src/tests/audio/tasks/speaker_verification_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Test Module for Audio Processing and Speaker Verification.
This module contains minimal tests to ensure the audio processing and speaker verification functions do not fail.
Tests:
- test_resample_iir: Tests the resample_iir function.
- test_verify_speaker: Tests the verify_speaker function.
- test_verify_speaker_from_files: Tests the verify_speaker_from_files function.
"""

import os

import pytest

from senselab.audio.data_structures.audio import Audio
from senselab.audio.tasks.preprocessing.preprocessing import resample_audios
from senselab.audio.tasks.speaker_verification.speaker_verification import (
verify_speaker,
)

if os.getenv("GITHUB_ACTIONS") != "true":

    @pytest.mark.large_model
    def test_verify_speaker(mono_audio_sample: Audio) -> None:
        """Smoke-tests verify_speaker on pairs built from a single audio sample.

        The sample is resampled to 16 kHz, paired with itself three times, and
        the first result is checked to be a (float, bool) tuple.

        Args:
            mono_audio_sample (Audio): The mono audio sample to use for testing.

        Returns:
            None
        """
        target_rate = 16000
        resampled = resample_audios([mono_audio_sample], target_rate)[0]
        assert resampled.sampling_rate == target_rate

        pairs = [(resampled, resampled)] * 3
        results = verify_speaker(pairs)
        assert results

        first_result = results[0]
        assert len(first_result) == 2
        assert isinstance(first_result[0], float)
        assert isinstance(first_result[1], bool)
Loading

0 comments on commit 75bef61

Please sign in to comment.