Merge pull request #92 from sensein/release_060

Release 0.6.0 adds speaker verification as a task, fixes some bugs, and resolves docs-build issues.
sensein · Jul 11, 2024 · 75bef61 · 75bef61
2 parents e0b8cd7 + b26be06
commit 75bef61
Show file tree

Hide file tree

Showing 11 changed files with 696 additions and 33 deletions.
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -30,15 +30,34 @@ jobs:
     - uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
+    - name: Install ffmpeg (Ubuntu)
+      if: startsWith(matrix.os, 'ubuntu')
+      run: sudo apt-get update && sudo apt-get install -y ffmpeg
+    - name: Install ffmpeg (macOS)
+      if: startsWith(matrix.os, 'macos')
+      run: brew install ffmpeg
+    - name: Install ffmpeg (Windows)
+      if: startsWith(matrix.os, 'windows')
+      run: choco install ffmpeg
+    - name: Install pipx and ensure it's up to date
+      run: |
+        python -m pip install --upgrade pipx
+        pipx ensurepath
+      shell: bash
+    - name: Install poetry
+      run: pipx install poetry==1.7.1
+      shell: bash
+    - name: Install dependencies with Poetry
       run: |
-        python -m pip install poetry==1.7.1
         poetry run pip install iso-639
         poetry install --with dev,docs
       shell: bash
     - name: Build docs
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
-        APP_MODULE_NAME=$(ls src -U | head -1)
+        APP_MODULE_NAME=$(ls -1 src | sort | head -1)
+        echo "APP_MODULE_NAME: $APP_MODULE_NAME"
         poetry run pdoc src/"$APP_MODULE_NAME" -o docs -t docs_style/pdoc-theme --docformat google
         touch docs/.nojekyll
       shell: bash

diff --git a/.gitignore b/.gitignore
@@ -76,7 +76,7 @@ instance/
 .scrapy
 
 # Sphinx documentation
-docs/_build/
+# docs/_build/
 
 # PyBuilder
 .pybuilder/
@@ -167,6 +167,8 @@ cython_debug/
 .idea/
 
 data/
+# pdoc documentation
+docs/
 
 # Speechbrain models
 pretrained_models

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -57,9 +57,9 @@ repos:
   rev: v2.3.0
   hooks:
   - id: codespell
+    args: [--skip=*.ipynb]
     additional_dependencies:
     - tomli
-
 - repo: https://github.com/hija/clean-dotenv
   rev: v0.0.7
   hooks:

diff --git a/audio_48khz_mono_16bits.wav b/audio_48khz_mono_16bits.wav
@@ -0,0 +1 @@
+/Users/isaacbevers/sensein/senselab-wrapper/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav
diff --git a/pyproject.toml b/pyproject.toml
@@ -150,7 +150,8 @@ pattern = "default-unprefixed"
 [tool.codespell]
 skip = [
   "poetry.lock",
-  "docs_style/pdoc-theme/syntax-highlighting.css"
+  "docs_style/pdoc-theme/syntax-highlighting.css",
+  "*.ipynb"
 ]
 ignore-words-list = ["senselab", "nd", "astroid", "wil", "SER"]
 

diff --git a/src/senselab/audio/tasks/preprocessing/preprocessing.py b/src/senselab/audio/tasks/preprocessing/preprocessing.py
@@ -1,34 +1,49 @@
 """This module implements some utilities for the preprocessing task."""
 
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 import pydra
-import torchaudio.functional as F
+import torch
+from scipy import signal
+from speechbrain.augment.time_domain import Resample
 
 from senselab.audio.data_structures.audio import Audio
 
 
-def resample_audios(audios: List[Audio], resample_rate: int, rolloff: float = 0.99) -> List[Audio]:
-    """Resamples all Audios to a given sampling rate.
-
-    Takes a list of audios and resamples each into the new sampling rate. Notably does not assume any
-    specific structure of the audios (can vary in stereo vs. mono as well as their original sampling rate)
+def resample_audios(
+    audios: List[Audio],
+    resample_rate: int,
+    lowcut: Optional[float] = None,
+    order: int = 4,
+) -> List[Audio]:
+    """Resamples a list of audio signals to a given sampling rate.
 
     Args:
-        audios: List of Audios to resample
-        resample_rate: Rate at which to resample the Audio
-        rolloff: The roll-off frequency of the filter, as a fraction of the Nyquist.
-            Lower values reduce anti-aliasing, but also reduce some of the highest frequencies
+        audios (List[Audio]): List of audio objects to resample.
+        resample_rate (int): Target sampling rate.
+        lowcut (float, optional): Cutoff frequency in Hz of the low-pass IIR filter. Defaults to resample_rate / 2 - 100.
+        order (int, optional): Order of the IIR filter. Defaults to 4.
 
     Returns:
-        List of Audios that have all been resampled to the given resampling rate
+        List[Audio]: Resampled audio objects.
     """
     resampled_audios = []
     for audio in audios:
-        resampled = F.resample(audio.waveform, audio.sampling_rate, resample_rate, rolloff=rolloff)
+        if lowcut is None:
+            lowcut = resample_rate / 2 - 100
+        sos = signal.butter(order, lowcut, btype="low", output="sos", fs=resample_rate)
+
+        channels = []
+        for channel in audio.waveform:
+            filtered_channel = torch.from_numpy(signal.sosfiltfilt(sos, channel.numpy()).copy()).float()
+            resampler = Resample(orig_freq=audio.sampling_rate, new_freq=resample_rate)
+            resampled_channel = resampler(filtered_channel.unsqueeze(0)).squeeze(0)
+            channels.append(resampled_channel)
+
+        resampled_waveform = torch.stack(channels)
         resampled_audios.append(
             Audio(
-                waveform=resampled,
+                waveform=resampled_waveform,
                 sampling_rate=resample_rate,
                 metadata=audio.metadata.copy(),
                 orig_path_or_id=audio.orig_path_or_id,

diff --git a/src/senselab/audio/tasks/speaker_verification/__init__.py b/src/senselab/audio/tasks/speaker_verification/__init__.py
@@ -0,0 +1 @@
+"""Verifies whether two audio segments belong to the same speaker."""
diff --git a/src/senselab/audio/tasks/speaker_verification/speaker_verification.py b/src/senselab/audio/tasks/speaker_verification/speaker_verification.py
@@ -0,0 +1,61 @@
+"""Audio Processing and Speaker Verification Module.
+
+This module provides a function for verifying whether pairs of audio samples
+come from the same speaker by comparing the cosine similarity of speaker
+embeddings extracted with a specified model. Input audio must be sampled at 16 kHz.
+"""
+
+from typing import List, Optional, Tuple
+
+from torch.nn.functional import cosine_similarity
+
+from senselab.audio.data_structures.audio import Audio
+from senselab.audio.tasks.speaker_embeddings.speechbrain import SpeechBrainEmbeddings
+from senselab.utils.data_structures.device import DeviceType, _select_device_and_dtype
+from senselab.utils.data_structures.model import SpeechBrainModel
+
+TRAINING_SAMPLE_RATE = 16000  # spkrec-ecapa-voxceleb trained on 16kHz audio
+
+
+def verify_speaker(
+    audios: List[Tuple[Audio, Audio]],
+    model: SpeechBrainModel = SpeechBrainModel(path_or_uri="speechbrain/spkrec-ecapa-voxceleb", revision="main"),
+    device: Optional[DeviceType] = None,
+    threshold: float = 0.25,
+) -> List[Tuple[float, bool]]:
+    """Verifies if two audio samples are from the same speaker.
+
+    Args:
+        audios (List[Tuple[Audio, Audio]]): Pairs of audio samples to compare; both must be sampled at 16 kHz.
+        model (SpeechBrainModel, optional): The model used to extract the speaker embeddings.
+        device (DeviceType, optional): Preferred device (CPU or CUDA); chosen automatically when None.
+        threshold (float, optional): Cosine similarity strictly above this value predicts "same speaker".
+
+    Returns:
+        List[Tuple[float, bool]]: One (score, prediction) tuple per input pair, where score is the
+            cosine similarity of the two speaker embeddings and prediction is ``score > threshold``.
+
+    Raises:
+        ValueError: If either audio of a pair is not sampled at TRAINING_SAMPLE_RATE.
+    """
+    # Honor the caller's device preference instead of discarding it (restricted to CPU/CUDA).
+    device = _select_device_and_dtype(user_preference=device, compatible_devices=[DeviceType.CPU, DeviceType.CUDA])[0]
+
+    scores_and_predictions = []
+    for audio1, audio2 in audios:
+        if audio1.sampling_rate != TRAINING_SAMPLE_RATE:
+            raise ValueError(f"{model.path_or_uri} trained on {TRAINING_SAMPLE_RATE} "
+                             f"sample audio, but audio1 has sample rate {audio1.sampling_rate}.")
+        if audio2.sampling_rate != TRAINING_SAMPLE_RATE:
+            raise ValueError(f"{model.path_or_uri} trained on {TRAINING_SAMPLE_RATE} "
+                             f"sample audio, but audio2 has sample rate {audio2.sampling_rate}.")
+
+        embeddings = SpeechBrainEmbeddings.extract_speechbrain_speaker_embeddings_from_audios(
+            audios=[audio1, audio2], model=model, device=device
+        )
+        embedding1, embedding2 = embeddings
+        similarity = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0))
+        score = similarity.mean().item()
+        prediction = score > threshold
+        scores_and_predictions.append((score, prediction))
+    return scores_and_predictions
diff --git a/src/senselab/audio/tasks/speech_enhancement/speechbrain.py b/src/senselab/audio/tasks/speech_enhancement/speechbrain.py
@@ -1,4 +1,5 @@
 """This module provides the Speechbrain interface for speech enhancement."""
+
 from typing import Dict, List, Optional
 
 import torch
@@ -35,19 +36,16 @@ def _get_speechbrain_model(
         )
         key = f"{model.path_or_uri}-{model.revision}-{device.value}"
         if key not in cls._models:
-            cls._models[key] = separator.from_hparams(
-                source=model.path_or_uri, 
-                run_opts={"device": device.value}
-            )
+            cls._models[key] = separator.from_hparams(source=model.path_or_uri, run_opts={"device": device.value})
         return cls._models[key]
 
     @classmethod
     def enhance_audios_with_speechbrain(
         cls,
         audios: List[Audio],
         model: SpeechBrainModel = SpeechBrainModel(
-            path_or_uri="speechbrain/sepformer-wham16k-enhancement", 
-            revision="main"),
+            path_or_uri="speechbrain/sepformer-wham16k-enhancement", revision="main"
+        ),
         device: Optional[DeviceType] = None,
     ) -> List[Audio]:
         """Enhances all audio samples in the dataset.
@@ -59,7 +57,7 @@ def enhance_audios_with_speechbrain(
 
         Returns:
             List[Audio]: The list of enhanced audio objects.
-        
+
         Todo:
             - Optimizing the computation by working in batches
             - Double-checking the input size of enhancer.encode_batch
@@ -70,13 +68,13 @@ def enhance_audios_with_speechbrain(
         # Check that all audio objects have the correct sampling rate
         for audio in audios:
             if audio.waveform.shape[0] != 1:
-                raise ValueError(
-                    f"Audio waveform must be mono (1 channel), but got {audio.waveform.shape[0]} channels"
-                )
+                raise ValueError(f"Audio waveform must be mono (1 channel), but got {audio.waveform.shape[0]} channels")
             if audio.sampling_rate != expected_sample_rate:
                 raise ValueError(
-                    "Audio sampling rate " + str(audio.sampling_rate) + 
-                    " does not match expected " + str(expected_sample_rate)
+                    "Audio sampling rate "
+                    + str(audio.sampling_rate)
+                    + " does not match expected "
+                    + str(expected_sample_rate)
                 )
 
         # Stack audio waveforms for batch processing
@@ -87,6 +85,6 @@ def enhance_audios_with_speechbrain(
 
         # Update the original audio objects with the enhanced waveforms
         for audio, enhanced_waveform in zip(audios, enhanced_waveforms):
-            audio.waveform = enhanced_waveform
+            audio.waveform = enhanced_waveform.reshape(1, -1)
 
         return audios
diff --git a/src/tests/audio/tasks/speaker_verification_test.py b/src/tests/audio/tasks/speaker_verification_test.py
@@ -0,0 +1,40 @@
+"""Test Module for Audio Processing and Speaker Verification.
+
+This module contains minimal tests to ensure the audio processing and speaker verification functions do not fail.
+
+Tests:
+    - test_verify_speaker: Resamples a mono sample to 16 kHz and checks that
+        verify_speaker returns a non-empty list whose first entry is a
+        (float, bool) pair. Only defined outside GitHub Actions.
+"""
+
+import os
+
+import pytest
+
+from senselab.audio.data_structures.audio import Audio
+from senselab.audio.tasks.preprocessing.preprocessing import resample_audios
+from senselab.audio.tasks.speaker_verification.speaker_verification import (
+    verify_speaker,
+)
+
+if os.getenv("GITHUB_ACTIONS") != "true":
+
+    @pytest.mark.large_model
+    def test_verify_speaker(mono_audio_sample: Audio) -> None:
+        """Smoke-tests verify_speaker on three identical (sample, sample) pairs.
+
+        Only defined when the GITHUB_ACTIONS environment variable is not "true".
+
+        Args:
+            mono_audio_sample (Audio): The mono audio sample to use for testing.
+        """
+        # verify_speaker raises ValueError unless the audio is sampled at 16 kHz.
+        mono_audio_sample = resample_audios([mono_audio_sample], 16000)[0]
+        assert mono_audio_sample.sampling_rate == 16000
+        mono_audio_samples = [(mono_audio_sample, mono_audio_sample)] * 3
+        scores_and_predictions = verify_speaker(mono_audio_samples)
+        assert scores_and_predictions
+        assert len(scores_and_predictions[0]) == 2
+        assert isinstance(scores_and_predictions[0][0], float)
+        assert isinstance(scores_and_predictions[0][1], bool)