diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 08e8e1f3..ecee7d6f 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,15 +30,34 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install ffmpeg (Ubuntu) + if: startsWith(matrix.os, 'ubuntu') + run: sudo apt-get update && sudo apt-get install -y ffmpeg + - name: Install ffmpeg (macOS) + if: startsWith(matrix.os, 'macos') + run: brew install ffmpeg + - name: Install ffmpeg (Windows) + if: startsWith(matrix.os, 'windows') + run: choco install ffmpeg + - name: Install pipx and ensure it's up to date + run: | + python -m pip install --upgrade pipx + pipx ensurepath + shell: bash + - name: Install poetry + run: pipx install poetry==1.7.1 + shell: bash + - name: Install dependencies with Poetry run: | - python -m pip install poetry==1.7.1 poetry run pip install iso-639 poetry install --with dev,docs shell: bash - name: Build docs + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - APP_MODULE_NAME=$(ls src -U | head -1) + APP_MODULE_NAME=$(ls -1 src | sort | head -1) + echo "APP_MODULE_NAME: $APP_MODULE_NAME" poetry run pdoc src/"$APP_MODULE_NAME" -o docs -t docs_style/pdoc-theme --docformat google touch docs/.nojekyll shell: bash diff --git a/.gitignore b/.gitignore index cdf50e29..4ddeae03 100644 --- a/.gitignore +++ b/.gitignore @@ -76,7 +76,7 @@ instance/ .scrapy # Sphinx documentation -docs/_build/ +# docs/_build/ # PyBuilder .pybuilder/ @@ -167,6 +167,8 @@ cython_debug/ .idea/ data/ +# pdoc documentation +docs/ # Speechbrain models pretrained_models diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6c4ae8d9..b5880847 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,9 +57,9 @@ repos: rev: v2.3.0 hooks: - id: codespell + args: [--skip=*.ipynb] additional_dependencies: - tomli - - repo: https://github.com/hija/clean-dotenv rev: v0.0.7 
hooks: diff --git a/audio_48khz_mono_16bits.wav b/audio_48khz_mono_16bits.wav new file mode 120000 index 00000000..6f4d4163 --- /dev/null +++ b/audio_48khz_mono_16bits.wav @@ -0,0 +1 @@ +/Users/isaacbevers/sensein/senselab-wrapper/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 1e9a6748..173100a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -150,7 +150,8 @@ pattern = "default-unprefixed" [tool.codespell] skip = [ "poetry.lock", - "docs_style/pdoc-theme/syntax-highlighting.css" + "docs_style/pdoc-theme/syntax-highlighting.css", + "*.ipynb" ] ignore-words-list = ["senselab", "nd", "astroid", "wil", "SER"] diff --git a/src/senselab/audio/tasks/preprocessing/preprocessing.py b/src/senselab/audio/tasks/preprocessing/preprocessing.py index 3e928569..1ba5404b 100644 --- a/src/senselab/audio/tasks/preprocessing/preprocessing.py +++ b/src/senselab/audio/tasks/preprocessing/preprocessing.py @@ -1,34 +1,49 @@ """This module implements some utilities for the preprocessing task.""" -from typing import List, Tuple +from typing import List, Optional, Tuple import pydra -import torchaudio.functional as F +import torch +from scipy import signal +from speechbrain.augment.time_domain import Resample from senselab.audio.data_structures.audio import Audio -def resample_audios(audios: List[Audio], resample_rate: int, rolloff: float = 0.99) -> List[Audio]: - """Resamples all Audios to a given sampling rate. - - Takes a list of audios and resamples each into the new sampling rate. Notably does not assume any - specific structure of the audios (can vary in stereo vs. mono as well as their original sampling rate) +def resample_audios( + audios: List[Audio], + resample_rate: int, + lowcut: Optional[float] = None, + order: int = 4, +) -> List[Audio]: + """Resamples a list of audio signals to a given sampling rate. 
Args: - audios: List of Audios to resample - resample_rate: Rate at which to resample the Audio - rolloff: The roll-off frequency of the filter, as a fraction of the Nyquist. - Lower values reduce anti-aliasing, but also reduce some of the highest frequencies + audios (List[Audio]): List of audio objects to resample. + resample_rate (int): Target sampling rate. + lowcut (float, optional): Low cut frequency for IIR filter. + order (int, optional): Order of the IIR filter. Defaults to 4. Returns: - List of Audios that have all been resampled to the given resampling rate + List[Audio]: Resampled audio objects. """ resampled_audios = [] for audio in audios: - resampled = F.resample(audio.waveform, audio.sampling_rate, resample_rate, rolloff=rolloff) + if lowcut is None: + lowcut = resample_rate / 2 - 100 + sos = signal.butter(order, lowcut, btype="low", output="sos", fs=resample_rate) + + channels = [] + for channel in audio.waveform: + filtered_channel = torch.from_numpy(signal.sosfiltfilt(sos, channel.numpy()).copy()).float() + resampler = Resample(orig_freq=audio.sampling_rate, new_freq=resample_rate) + resampled_channel = resampler(filtered_channel.unsqueeze(0)).squeeze(0) + channels.append(resampled_channel) + + resampled_waveform = torch.stack(channels) resampled_audios.append( Audio( - waveform=resampled, + waveform=resampled_waveform, sampling_rate=resample_rate, metadata=audio.metadata.copy(), orig_path_or_id=audio.orig_path_or_id, diff --git a/src/senselab/audio/tasks/speaker_verification/__init__.py b/src/senselab/audio/tasks/speaker_verification/__init__.py new file mode 100644 index 00000000..941cfc0b --- /dev/null +++ b/src/senselab/audio/tasks/speaker_verification/__init__.py @@ -0,0 +1 @@ +"""Verifies whether two audio segments belong to the same speaker.""" diff --git a/src/senselab/audio/tasks/speaker_verification/speaker_verification.py b/src/senselab/audio/tasks/speaker_verification/speaker_verification.py new file mode 100644 index 
00000000..5f739ff8 --- /dev/null +++ b/src/senselab/audio/tasks/speaker_verification/speaker_verification.py @@ -0,0 +1,61 @@ +"""Audio Processing and Speaker Verification Module. + +This module provides functions for resampling audio using an IIR filter and +verifying if two audio samples or files are from the same speaker using a +specified model. +""" + +from typing import List, Optional, Tuple + +from torch.nn.functional import cosine_similarity + +from senselab.audio.data_structures.audio import Audio +from senselab.audio.tasks.speaker_embeddings.speechbrain import SpeechBrainEmbeddings +from senselab.utils.data_structures.device import DeviceType, _select_device_and_dtype +from senselab.utils.data_structures.model import SpeechBrainModel + +TRAINING_SAMPLE_RATE = 16000 # spkrec-ecapa-voxceleb trained on 16kHz audio + + +def verify_speaker( + audios: List[Tuple[Audio, Audio]], + model: SpeechBrainModel = SpeechBrainModel(path_or_uri="speechbrain/spkrec-ecapa-voxceleb", revision="main"), + device: Optional[DeviceType] = None, + threshold: float = 0.25, +) -> List[Tuple[float, bool]]: + """Verifies if two audio samples are from the same speaker. + + Args: + audios (List[Tuple[Audio, Audio]]): A list of tuples, where each tuple contains + two audio samples to be compared. + model (SpeechBrainModel, optional): The model for speaker verification. + device (DeviceType, optional): The device to run the model on. Defaults to CPU. + threshold (float, optional): The threshold to determine same speaker. + + Returns: + List[Tuple[float, bool]]: A list of tuples containing the verification score and + the prediction for each pair of audio samples. The + verification score is a float indicating the similarity + between the two samples, and the prediction is a boolean + indicating if the two samples are from the same speaker. 
+ """ + device = _select_device_and_dtype(compatible_devices=[DeviceType.CPU, DeviceType.CUDA])[0] + + scores_and_predictions = [] + for audio1, audio2 in audios: + if audio1.sampling_rate != TRAINING_SAMPLE_RATE: + raise ValueError(f"{model.path_or_uri} trained on {TRAINING_SAMPLE_RATE} \ + sample audio, but audio1 has sample rate {audio1.sampling_rate}.") + if audio2.sampling_rate != TRAINING_SAMPLE_RATE: + raise ValueError(f"{model.path_or_uri} trained on {TRAINING_SAMPLE_RATE} \ + sample audio, but audio2 has sample rate {audio2.sampling_rate}.") + + embeddings = SpeechBrainEmbeddings.extract_speechbrain_speaker_embeddings_from_audios( + audios=[audio1, audio2], model=model, device=device + ) + embedding1, embedding2 = embeddings + similarity = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0)) + score = similarity.mean().item() + prediction = score > threshold + scores_and_predictions.append((score, prediction)) + return scores_and_predictions diff --git a/src/senselab/audio/tasks/speech_enhancement/speechbrain.py b/src/senselab/audio/tasks/speech_enhancement/speechbrain.py index 47ad362a..fea33dc2 100644 --- a/src/senselab/audio/tasks/speech_enhancement/speechbrain.py +++ b/src/senselab/audio/tasks/speech_enhancement/speechbrain.py @@ -1,4 +1,5 @@ """This module provides the Speechbrain interface for speech enhancement.""" + from typing import Dict, List, Optional import torch @@ -35,10 +36,7 @@ def _get_speechbrain_model( ) key = f"{model.path_or_uri}-{model.revision}-{device.value}" if key not in cls._models: - cls._models[key] = separator.from_hparams( - source=model.path_or_uri, - run_opts={"device": device.value} - ) + cls._models[key] = separator.from_hparams(source=model.path_or_uri, run_opts={"device": device.value}) return cls._models[key] @classmethod @@ -46,8 +44,8 @@ def enhance_audios_with_speechbrain( cls, audios: List[Audio], model: SpeechBrainModel = SpeechBrainModel( - path_or_uri="speechbrain/sepformer-wham16k-enhancement", 
- revision="main"), + path_or_uri="speechbrain/sepformer-wham16k-enhancement", revision="main" + ), device: Optional[DeviceType] = None, ) -> List[Audio]: """Enhances all audio samples in the dataset. @@ -59,7 +57,7 @@ def enhance_audios_with_speechbrain( Returns: List[Audio]: The list of enhanced audio objects. - + Todo: - Optimizing the computation by working in batches - Double-checking the input size of enhancer.encode_batch @@ -70,13 +68,13 @@ def enhance_audios_with_speechbrain( # Check that all audio objects have the correct sampling rate for audio in audios: if audio.waveform.shape[0] != 1: - raise ValueError( - f"Audio waveform must be mono (1 channel), but got {audio.waveform.shape[0]} channels" - ) + raise ValueError(f"Audio waveform must be mono (1 channel), but got {audio.waveform.shape[0]} channels") if audio.sampling_rate != expected_sample_rate: raise ValueError( - "Audio sampling rate " + str(audio.sampling_rate) + - " does not match expected " + str(expected_sample_rate) + "Audio sampling rate " + + str(audio.sampling_rate) + + " does not match expected " + + str(expected_sample_rate) ) # Stack audio waveforms for batch processing @@ -87,6 +85,6 @@ def enhance_audios_with_speechbrain( # Update the original audio objects with the enhanced waveforms for audio, enhanced_waveform in zip(audios, enhanced_waveforms): - audio.waveform = enhanced_waveform + audio.waveform = enhanced_waveform.reshape(1, -1) return audios diff --git a/src/tests/audio/tasks/speaker_verification_test.py b/src/tests/audio/tasks/speaker_verification_test.py new file mode 100644 index 00000000..f42ccd38 --- /dev/null +++ b/src/tests/audio/tasks/speaker_verification_test.py @@ -0,0 +1,40 @@ +"""Test Module for Audio Processing and Speaker Verification. + +This module contains minimal tests to ensure the audio processing and speaker verification functions do not fail. + +Tests: + - test_resample_iir: Tests the resample_iir function. 
+ - test_verify_speaker: Tests the verify_speaker function. + - test_verify_speaker_from_files: Tests the verify_speaker_from_files function. +""" + +import os + +import pytest + +from senselab.audio.data_structures.audio import Audio +from senselab.audio.tasks.preprocessing.preprocessing import resample_audios +from senselab.audio.tasks.speaker_verification.speaker_verification import ( + verify_speaker, +) + +if os.getenv("GITHUB_ACTIONS") != "true": + + @pytest.mark.large_model + def test_verify_speaker(mono_audio_sample: Audio) -> None: + """Tests the verify_speaker function to ensure it does not fail. + + Args: + mono_audio_sample (Audio): The mono audio sample to use for testing. + + Returns: + None + """ + mono_audio_sample = resample_audios([mono_audio_sample], 16000)[0] + assert mono_audio_sample.sampling_rate == 16000 + mono_audio_samples = [(mono_audio_sample, mono_audio_sample)] * 3 + scores_and_predictions = verify_speaker(mono_audio_samples) + assert scores_and_predictions + assert len(scores_and_predictions[0]) == 2 + assert isinstance(scores_and_predictions[0][0], float) + assert isinstance(scores_and_predictions[0][1], bool) diff --git a/tutorials/getting_started.ipynb b/tutorials/getting_started.ipynb new file mode 100644 index 00000000..f7216ad5 --- /dev/null +++ b/tutorials/getting_started.ipynb @@ -0,0 +1,525 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting Started with ```senselab```\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensein/senselab/blob/main/tutorials/getting_started.ipynb)\n", + "\n", + "\n", + "Welcome to the `senselab` quick start tutorial! \n", + "\n", + "This guide will showcase some of the key functionalities offered by `senselab`. We'll cover how to read, preprocess, analyze, and manipulate audio data. For more details, please check the documentation and task-specific tutorials. 
\n", + "\n", + "Note that the package evolves continuously, so if you find that this tutorial breaks at some point, please let us know by opening an issue. \n", + "\n", + "Let's get started!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "pip install senselab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reading audio clips from disk:\n", + "Need to read some audio files from disk? **EASY!**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from senselab.audio.data_structures.audio import Audio\n", + "\n", + "MONO_AUDIO_PATH = \"../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\"\n", + "STEREO_AUDIO_PATH = \"../src/tests/data_for_testing/audio_48khz_stereo_16bits.wav\"\n", + "\n", + "audio1 = Audio.from_filepath(MONO_AUDIO_PATH)\n", + "audio2 = Audio.from_filepath(STEREO_AUDIO_PATH)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Downmixing audio clips to mono\n", + "Want to downmix your audio to mono? It has never been that **EASY!**! 
Here\u2019s how:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The original audio has 2 channels.\n", + "The downmixed audio has 1 channels.\n" + ] + } + ], + "source": [ + "from senselab.audio.tasks.preprocessing.preprocessing import downmix_audios_to_mono\n", + "\n", + "print(\"The original audio has {} channels.\".format(audio2.waveform.shape[0]))\n", + "audio2 = downmix_audios_to_mono([audio2])[0]\n", + "print(\"The downmixed audio has {} channels.\".format(audio2.waveform.shape[0]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resampling audio clips to 16000 Hz\n", + "Need to resample your audio to 16000 Hz? **EASY!**\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The original audio has a sampling rate of 48000 Hz.\n", + "The resampled audio has a sampling rate of 16000 Hz.\n" + ] + } + ], + "source": [ + "from senselab.audio.tasks.preprocessing.preprocessing import resample_audios\n", + "\n", + "print(\"The original audio has a sampling rate of {} Hz.\".format(audio1.sampling_rate))\n", + "[audio1, audio2] = resample_audios([audio1, audio2], resample_rate=16000)\n", + "print(\"The resampled audio has a sampling rate of {} Hz.\".format(audio1.sampling_rate))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Playing and plotting audio\n", + "Want to play or plot your audio? **EASY!**! 
Here is how:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from senselab.audio.tasks.plotting.plotting import play_audio\n", + "play_audio(audio1)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from senselab.audio.tasks.plotting.plotting import plot_waveform\n", + "plot_waveform(audio1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Voice Activity Detection\n", + "Want to detect when someone is speaking? **EASY!**" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Voice activity detection results: [[ScriptLine(text=None, speaker='VOICE', start=0.03096875, end=3.1359687500000004, chunks=None), ScriptLine(text=None, speaker='VOICE', start=3.3553437500000003, end=3.6253437500000003, chunks=None), ScriptLine(text=None, speaker='VOICE', start=3.6253437500000003, end=3.94596875, chunks=None), ScriptLine(text=None, speaker='VOICE', start=3.94596875, end=4.87409375, chunks=None)], [ScriptLine(text=None, speaker='VOICE', start=0.03096875, end=3.0515937500000003, chunks=None), ScriptLine(text=None, speaker='VOICE', start=3.3215937500000003, end=4.890968750000001, chunks=None)]]\n" + ] + } + ], + "source": [ + "from senselab.audio.tasks.voice_activity_detection.api import detect_human_voice_activity_in_audios\n", + "from senselab.utils.data_structures.model import PyannoteAudioModel\n", + "\n", + "pyannote_model = PyannoteAudioModel(path_or_uri=\"pyannote/speaker-diarization-3.1\", revision=\"main\")\n", + "voice_activity_results = detect_human_voice_activity_in_audios(audios=[audio1, audio2], model=pyannote_model)\n", + "print(\"Voice activity detection results: {}\".format(voice_activity_results))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Speaker Diarization\n", + "Wondering who is speaking and when? 
**EASY!**" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Diarization results: [[ScriptLine(text=None, speaker='SPEAKER_00', start=0.03096875, end=3.1359687500000004, chunks=None), ScriptLine(text=None, speaker='SPEAKER_00', start=3.3553437500000003, end=3.6253437500000003, chunks=None), ScriptLine(text=None, speaker='SPEAKER_01', start=3.6253437500000003, end=3.94596875, chunks=None), ScriptLine(text=None, speaker='SPEAKER_00', start=3.94596875, end=4.87409375, chunks=None)], [ScriptLine(text=None, speaker='SPEAKER_00', start=0.03096875, end=3.0515937500000003, chunks=None), ScriptLine(text=None, speaker='SPEAKER_01', start=3.3215937500000003, end=4.890968750000001, chunks=None)]]\n" + ] + } + ], + "source": [ + "from senselab.audio.tasks.speaker_diarization.api import diarize_audios\n", + "\n", + "pyannote_model = PyannoteAudioModel(path_or_uri=\"pyannote/speaker-diarization-3.1\", revision=\"main\")\n", + "diarization_results = diarize_audios(audios=[audio1, audio2], model=pyannote_model)\n", + "\n", + "print(\"Diarization results: {}\".format(diarization_results))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Automatic Speech Recognition\n", + "Want to convert speech to text? **EASY!**! Use this:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. 
If you want to instead always translate your audio to English, make sure to pass `language='en'`.\n", + "The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "WhisperModel is using WhisperSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation=\"eager\"` when loading the model.\n", + "Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transcripts: [ScriptLine(text='This is beautiful. This is Johnny. Kenny. Enjoy. 
We just wanted to take a minute to thank you.', speaker=None, start=0.0, end=4.9, chunks=[ScriptLine(text='This', speaker=None, start=0.0, end=0.3, chunks=None), ScriptLine(text='is', speaker=None, start=0.3, end=0.5, chunks=None), ScriptLine(text='beautiful.', speaker=None, start=0.5, end=1.04, chunks=None), ScriptLine(text='This', speaker=None, start=1.04, end=1.28, chunks=None), ScriptLine(text='is', speaker=None, start=1.28, end=1.4, chunks=None), ScriptLine(text='Johnny.', speaker=None, start=1.4, end=1.9, chunks=None), ScriptLine(text='Kenny.', speaker=None, start=1.9, end=2.5, chunks=None), ScriptLine(text='Enjoy.', speaker=None, start=2.5, end=3.48, chunks=None), ScriptLine(text='We', speaker=None, start=3.48, end=3.58, chunks=None), ScriptLine(text='just', speaker=None, start=3.58, end=3.74, chunks=None), ScriptLine(text='wanted', speaker=None, start=3.74, end=3.9, chunks=None), ScriptLine(text='to', speaker=None, start=3.9, end=4.06, chunks=None), ScriptLine(text='take', speaker=None, start=4.06, end=4.24, chunks=None), ScriptLine(text='a', speaker=None, start=4.24, end=4.38, chunks=None), ScriptLine(text='minute', speaker=None, start=4.38, end=4.48, chunks=None), ScriptLine(text='to', speaker=None, start=4.48, end=4.66, chunks=None), ScriptLine(text='thank', speaker=None, start=4.66, end=4.9, chunks=None), ScriptLine(text='you.', speaker=None, start=4.9, end=4.9, chunks=None)]), ScriptLine(text='This is Peter. This is Johnny. Kenny. And Joe. 
We just wanted to take a minute to thank you.', speaker=None, start=0.08, end=4.88, chunks=[ScriptLine(text='This', speaker=None, start=0.08, end=0.3, chunks=None), ScriptLine(text='is', speaker=None, start=0.3, end=0.48, chunks=None), ScriptLine(text='Peter.', speaker=None, start=0.48, end=1.1, chunks=None), ScriptLine(text='This', speaker=None, start=1.1, end=1.18, chunks=None), ScriptLine(text='is', speaker=None, start=1.18, end=1.4, chunks=None), ScriptLine(text='Johnny.', speaker=None, start=1.4, end=2.24, chunks=None), ScriptLine(text='Kenny.', speaker=None, start=2.24, end=2.44, chunks=None), ScriptLine(text='And', speaker=None, start=2.44, end=2.6, chunks=None), ScriptLine(text='Joe.', speaker=None, start=2.6, end=3.5, chunks=None), ScriptLine(text='We', speaker=None, start=3.5, end=3.52, chunks=None), ScriptLine(text='just', speaker=None, start=3.52, end=3.78, chunks=None), ScriptLine(text='wanted', speaker=None, start=3.78, end=3.9, chunks=None), ScriptLine(text='to', speaker=None, start=3.9, end=4.04, chunks=None), ScriptLine(text='take', speaker=None, start=4.04, end=4.24, chunks=None), ScriptLine(text='a', speaker=None, start=4.24, end=4.38, chunks=None), ScriptLine(text='minute', speaker=None, start=4.38, end=4.48, chunks=None), ScriptLine(text='to', speaker=None, start=4.48, end=4.68, chunks=None), ScriptLine(text='thank', speaker=None, start=4.68, end=4.88, chunks=None), ScriptLine(text='you.', speaker=None, start=4.88, end=5.14, chunks=None)])]\n" + ] + } + ], + "source": [ + "from senselab.audio.tasks.speech_to_text.api import transcribe_audios\n", + "from senselab.utils.data_structures.model import HFModel\n", + "\n", + "hf_model = HFModel(path_or_uri=\"openai/whisper-tiny\", revision=\"main\")\n", + "transcripts = transcribe_audios(audios=[audio1, audio2], model=hf_model)\n", + "\n", + "print(\"Transcripts: {}\".format(transcripts))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Speaker Embeddings\n", + "Need to 
get unique speaker signatures? **EASY!** Here\u2019s how:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Speaker embeddings: [tensor([ 13.3957, 26.2355, 37.4537, -2.8282, 8.9507, -29.2735, 7.2133,\n", + " 45.1035, -15.5556, -1.7310, 21.6599, -20.0387, 3.2208, 8.7215,\n", + " -10.7560, 22.5969, 29.0999, -11.3117, -7.1902, 1.2383, -1.0127,\n", + " -2.9714, -9.1717, 2.9307, 14.5644, 22.5480, 32.8797, -8.2324,\n", + " -45.2855, -44.8601, -0.4651, 3.9789, 23.8119, 1.7783, -5.7042,\n", + " -15.5208, 19.8033, -24.0270, 5.0979, -3.6242, -13.6030, 39.9842,\n", + " 13.3587, -3.6292, -7.7488, 15.8596, -11.6165, -11.8744, 6.2166,\n", + " -3.0217, -21.2551, 0.7295, 19.4582, 29.5501, -5.4905, -18.5775,\n", + " -16.1116, -33.7832, -3.8432, 17.6366, -2.1034, 9.4590, -27.4943,\n", + " 0.6271, 28.4789, -10.5076, -11.0000, -6.1025, -16.5245, -10.9806,\n", + " 1.6901, 17.1203, 1.9592, -28.5540, 24.3082, 10.7635, -0.8738,\n", + " -17.3388, -8.4655, 15.3403, 22.4938, -6.3000, 4.0899, 9.6344,\n", + " -1.6355, 22.7877, 11.6864, 28.3855, -8.9496, 31.4788, -38.0877,\n", + " 9.4627, 27.5120, -9.9188, -31.4496, 15.4476, 2.0600, -4.8030,\n", + " -21.5671, 5.3270, 30.7293, -4.6995, 4.1002, -7.2646, 2.5260,\n", + " -10.8024, 1.5449, -5.9423, 8.8494, -6.3957, 10.0073, -25.6943,\n", + " 17.2914, 25.6856, 20.1078, -8.2351, -20.7194, 9.9062, -8.8984,\n", + " 5.9353, -27.8755, -7.2793, -21.9900, -2.7173, -17.6591, 20.7600,\n", + " -13.0104, -18.6974, -6.9513, 17.8341, -23.1684, -4.7947, -14.2303,\n", + " 1.2443, -9.7829, -3.3088, -20.0398, -12.6467, -6.2099, 2.0532,\n", + " 12.2837, 15.2998, -9.5024, -8.2001, 8.7692, 3.6774, 11.5215,\n", + " -1.4098, 0.6301, 1.7540, 11.4052, -6.6771, -8.7574, 35.5005,\n", + " -4.3977, -0.5316, -22.3267, 15.5236, -17.2237, 5.5499, 11.2341,\n", + " 16.1329, -9.3619, -13.6780, 1.3820, 15.5668, -0.2123, 8.9507,\n", + " 7.4602, -12.7370, 10.6937, -11.4908, 
12.6570, -24.1560, 1.6060,\n", + " -18.6128, -4.3532, 12.1146, 13.4223, 16.2724, -14.1925, 28.3128,\n", + " 28.5381, -7.4341, -28.3595, -0.2795, 4.3653, -6.5646, -25.4785,\n", + " -11.7617, 19.1149, -15.8362]), tensor([ 13.3957, 26.2355, 37.4537, -2.8282, 8.9507, -29.2735, 7.2133,\n", + " 45.1035, -15.5556, -1.7310, 21.6599, -20.0387, 3.2208, 8.7215,\n", + " -10.7560, 22.5969, 29.0999, -11.3117, -7.1902, 1.2383, -1.0127,\n", + " -2.9714, -9.1717, 2.9307, 14.5644, 22.5480, 32.8797, -8.2324,\n", + " -45.2855, -44.8601, -0.4651, 3.9789, 23.8119, 1.7783, -5.7042,\n", + " -15.5208, 19.8033, -24.0270, 5.0979, -3.6242, -13.6030, 39.9842,\n", + " 13.3587, -3.6292, -7.7488, 15.8596, -11.6165, -11.8744, 6.2166,\n", + " -3.0217, -21.2551, 0.7295, 19.4582, 29.5501, -5.4905, -18.5775,\n", + " -16.1116, -33.7832, -3.8432, 17.6366, -2.1034, 9.4590, -27.4943,\n", + " 0.6271, 28.4789, -10.5076, -11.0000, -6.1025, -16.5245, -10.9806,\n", + " 1.6901, 17.1203, 1.9592, -28.5540, 24.3082, 10.7635, -0.8738,\n", + " -17.3388, -8.4655, 15.3403, 22.4938, -6.3000, 4.0899, 9.6344,\n", + " -1.6355, 22.7877, 11.6864, 28.3855, -8.9496, 31.4788, -38.0877,\n", + " 9.4627, 27.5120, -9.9188, -31.4496, 15.4476, 2.0600, -4.8030,\n", + " -21.5671, 5.3270, 30.7293, -4.6995, 4.1002, -7.2646, 2.5260,\n", + " -10.8024, 1.5449, -5.9423, 8.8494, -6.3957, 10.0073, -25.6943,\n", + " 17.2914, 25.6856, 20.1078, -8.2351, -20.7194, 9.9062, -8.8984,\n", + " 5.9353, -27.8755, -7.2793, -21.9900, -2.7173, -17.6591, 20.7600,\n", + " -13.0104, -18.6974, -6.9513, 17.8341, -23.1684, -4.7947, -14.2303,\n", + " 1.2443, -9.7829, -3.3088, -20.0398, -12.6467, -6.2099, 2.0532,\n", + " 12.2837, 15.2998, -9.5024, -8.2001, 8.7692, 3.6774, 11.5215,\n", + " -1.4098, 0.6301, 1.7540, 11.4052, -6.6771, -8.7574, 35.5005,\n", + " -4.3977, -0.5316, -22.3267, 15.5236, -17.2237, 5.5499, 11.2341,\n", + " 16.1329, -9.3619, -13.6780, 1.3820, 15.5668, -0.2123, 8.9507,\n", + " 7.4602, -12.7370, 10.6937, -11.4908, 12.6570, -24.1560, 1.6060,\n", + 
" -18.6128, -4.3532, 12.1146, 13.4223, 16.2724, -14.1925, 28.3128,\n", + " 28.5381, -7.4341, -28.3595, -0.2795, 4.3653, -6.5646, -25.4785,\n", + " -11.7617, 19.1149, -15.8362])]\n" + ] + } + ], + "source": [ + "from senselab.audio.tasks.speaker_embeddings.api import extract_speaker_embeddings_from_audios\n", + "from senselab.utils.data_structures.model import SpeechBrainModel\n", + "\n", + "ecapa_model = SpeechBrainModel(path_or_uri=\"speechbrain/spkrec-ecapa-voxceleb\", revision=\"main\")\n", + "embeddings = extract_speaker_embeddings_from_audios(audios=[audio1, audio1], model=ecapa_model)\n", + "\n", + "print(\"Speaker embeddings: {}\".format(embeddings))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Speech Emotion Recognition\n", + "Want to know the emotions in the speech? **EASY!**" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']\n", + "- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. 
Model will be on CPU.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Emotion results: [('neutral', {'neutral': 0.14024856686592102, 'angry': 0.1284991055727005, 'calm': 0.12721076607704163, 'happy': 0.12397731840610504, 'disgust': 0.12309260666370392}), ('neutral', {'neutral': 0.1407526433467865, 'angry': 0.1282917857170105, 'calm': 0.12782087922096252, 'disgust': 0.12420850247144699, 'happy': 0.12152237445116043})]\n" + ] + } + ], + "source": [ + "from senselab.audio.tasks.classification.speech_emotion_recognition import speech_emotion_recognition_with_hf_models\n", + "\n", + "emotion_model = HFModel(path_or_uri=\"ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition\")\n", + "emotion_results = speech_emotion_recognition_with_hf_models([audio1, audio2], emotion_model)\n", + "\n", + "print(\"Emotion results: {}\".format(emotion_results))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Audio Augmentation\n", + "Need to augment your audio data? **EASY!**! 
Here\u2019s how:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Augmented audio: waveform=tensor([[1.0264e-04, 7.8498e-05, 1.1372e-04, ..., -0.0000e+00, -0.0000e+00,\n", + " -0.0000e+00]]) sampling_rate=16000 orig_path_or_id='../src/tests/data_for_testing/audio_48khz_mono_16bits.wav' metadata={}\n" + ] + } + ], + "source": [ + "from torch_audiomentations import Compose, PolarityInversion\n", + "from senselab.audio.tasks.data_augmentation.data_augmentation import augment_audios\n", + "\n", + "apply_augmentation = Compose(transforms=[PolarityInversion(p=1, output_type=\"dict\")], output_type=\"dict\")\n", + "[augmented_audio1, augmented_audio2] = augment_audios([audio1, audio2], apply_augmentation)\n", + "\n", + "print(\"Augmented audio: {}\".format(augmented_audio1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature Extraction\n", + "Want to extract some OPENSMILE features from audio? 
**EASY!**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OpenSMILE features: [{'F0semitoneFrom27.5Hz_sma3nz_amean': 0.0, 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': 0.0, 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0': 0.0, 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0': 0.0, 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0': 0.0, 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2': 0.0, 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope': 0.0, 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope': 0.0, 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope': 0.0, 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope': 0.0, 'loudness_sma3_amean': 4.665394306182861, 'loudness_sma3_stddevNorm': 0.5007734298706055, 'loudness_sma3_percentile20.0': 1.9130202531814575, 'loudness_sma3_percentile50.0': 5.763050079345703, 'loudness_sma3_percentile80.0': 6.902288436889648, 'loudness_sma3_pctlrange0-2': 4.9892683029174805, 'loudness_sma3_meanRisingSlope': 48.17048645019531, 'loudness_sma3_stddevRisingSlope': 29.044353485107422, 'loudness_sma3_meanFallingSlope': 34.63795471191406, 'loudness_sma3_stddevFallingSlope': 30.348445892333984, 'spectralFlux_sma3_amean': 3.0373308658599854, 'spectralFlux_sma3_stddevNorm': 0.5504347085952759, 'mfcc1_sma3_amean': -7.3260178565979, 'mfcc1_sma3_stddevNorm': -1.2529454231262207, 'mfcc2_sma3_amean': 1.0084701776504517, 'mfcc2_sma3_stddevNorm': 8.086338996887207, 'mfcc3_sma3_amean': 1.6081973314285278, 'mfcc3_sma3_stddevNorm': 4.823134899139404, 'mfcc4_sma3_amean': 0.7950991988182068, 'mfcc4_sma3_stddevNorm': 6.340704917907715, 'jitterLocal_sma3nz_amean': 0.0, 'jitterLocal_sma3nz_stddevNorm': 0.0, 'shimmerLocaldB_sma3nz_amean': 0.0, 'shimmerLocaldB_sma3nz_stddevNorm': 0.0, 'HNRdBACF_sma3nz_amean': 0.0, 'HNRdBACF_sma3nz_stddevNorm': 0.0, 'logRelF0-H1-H2_sma3nz_amean': 0.0, 'logRelF0-H1-H2_sma3nz_stddevNorm': 0.0, 'logRelF0-H1-A3_sma3nz_amean': 0.0, 
'logRelF0-H1-A3_sma3nz_stddevNorm': 0.0, 'F1frequency_sma3nz_amean': 0.0, 'F1frequency_sma3nz_stddevNorm': 0.0, 'F1bandwidth_sma3nz_amean': 0.0, 'F1bandwidth_sma3nz_stddevNorm': 0.0, 'F1amplitudeLogRelF0_sma3nz_amean': -201.0, 'F1amplitudeLogRelF0_sma3nz_stddevNorm': 0.0, 'F2frequency_sma3nz_amean': 0.0, 'F2frequency_sma3nz_stddevNorm': 0.0, 'F2bandwidth_sma3nz_amean': 0.0, 'F2bandwidth_sma3nz_stddevNorm': 0.0, 'F2amplitudeLogRelF0_sma3nz_amean': -201.0, 'F2amplitudeLogRelF0_sma3nz_stddevNorm': 0.0, 'F3frequency_sma3nz_amean': 0.0, 'F3frequency_sma3nz_stddevNorm': 0.0, 'F3bandwidth_sma3nz_amean': 0.0, 'F3bandwidth_sma3nz_stddevNorm': 0.0, 'F3amplitudeLogRelF0_sma3nz_amean': -201.0, 'F3amplitudeLogRelF0_sma3nz_stddevNorm': 0.0, 'alphaRatioV_sma3nz_amean': 0.0, 'alphaRatioV_sma3nz_stddevNorm': 0.0, 'hammarbergIndexV_sma3nz_amean': 0.0, 'hammarbergIndexV_sma3nz_stddevNorm': 0.0, 'slopeV0-500_sma3nz_amean': 0.0, 'slopeV0-500_sma3nz_stddevNorm': 0.0, 'slopeV500-1500_sma3nz_amean': 0.0, 'slopeV500-1500_sma3nz_stddevNorm': 0.0, 'spectralFluxV_sma3nz_amean': 0.0, 'spectralFluxV_sma3nz_stddevNorm': 0.0, 'mfcc1V_sma3nz_amean': 0.0, 'mfcc1V_sma3nz_stddevNorm': 0.0, 'mfcc2V_sma3nz_amean': 0.0, 'mfcc2V_sma3nz_stddevNorm': 0.0, 'mfcc3V_sma3nz_amean': 0.0, 'mfcc3V_sma3nz_stddevNorm': 0.0, 'mfcc4V_sma3nz_amean': 0.0, 'mfcc4V_sma3nz_stddevNorm': 0.0, 'alphaRatioUV_sma3nz_amean': 3.881474018096924, 'hammarbergIndexUV_sma3nz_amean': 3.0704212188720703, 'slopeUV0-500_sma3nz_amean': -0.011536612175405025, 'slopeUV500-1500_sma3nz_amean': -0.004415606148540974, 'spectralFluxUV_sma3nz_amean': 3.0689496994018555, 'loudnessPeaksPerSec': 5.091650009155273, 'VoicedSegmentsPerSec': 0.0, 'MeanVoicedSegmentLengthSec': 0.0, 'StddevVoicedSegmentLengthSec': 0.0, 'MeanUnvoicedSegmentLength': 4.849999904632568, 'StddevUnvoicedSegmentLength': 0.0, 'equivalentSoundLevel_dBp': -7.12342643737793}, {'F0semitoneFrom27.5Hz_sma3nz_amean': 25.710796356201172, 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': 
0.1605353206396103, 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0': 21.095951080322266, 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0': 25.9762020111084, 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0': 29.512413024902344, 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2': 8.416461944580078, 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope': 82.34796905517578, 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope': 99.20043182373047, 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope': 22.002275466918945, 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope': 9.043970108032227, 'loudness_sma3_amean': 0.8608756065368652, 'loudness_sma3_stddevNorm': 0.43875232338905334, 'loudness_sma3_percentile20.0': 0.5877408981323242, 'loudness_sma3_percentile50.0': 0.8352401852607727, 'loudness_sma3_percentile80.0': 1.1747918128967285, 'loudness_sma3_pctlrange0-2': 0.5870509147644043, 'loudness_sma3_meanRisingSlope': 10.285205841064453, 'loudness_sma3_stddevRisingSlope': 7.544795513153076, 'loudness_sma3_meanFallingSlope': 7.612530708312988, 'loudness_sma3_stddevFallingSlope': 4.159041404724121, 'spectralFlux_sma3_amean': 0.3213598430156708, 'spectralFlux_sma3_stddevNorm': 0.6921582221984863, 'mfcc1_sma3_amean': 10.274803161621094, 'mfcc1_sma3_stddevNorm': 1.1581648588180542, 'mfcc2_sma3_amean': 4.262022018432617, 'mfcc2_sma3_stddevNorm': 2.0523014068603516, 'mfcc3_sma3_amean': 7.624594211578369, 'mfcc3_sma3_stddevNorm': 1.4570356607437134, 'mfcc4_sma3_amean': 3.667618751525879, 'mfcc4_sma3_stddevNorm': 2.69022274017334, 'jitterLocal_sma3nz_amean': 0.019597545266151428, 'jitterLocal_sma3nz_stddevNorm': 0.9063855409622192, 'shimmerLocaldB_sma3nz_amean': 1.264746069908142, 'shimmerLocaldB_sma3nz_stddevNorm': 0.4629262685775757, 'HNRdBACF_sma3nz_amean': 3.6400070190429688, 'HNRdBACF_sma3nz_stddevNorm': 0.5911335945129395, 'logRelF0-H1-H2_sma3nz_amean': 1.2158769369125366, 'logRelF0-H1-H2_sma3nz_stddevNorm': 3.8838469982147217, 'logRelF0-H1-A3_sma3nz_amean': 18.83077621459961, 'logRelF0-H1-A3_sma3nz_stddevNorm': 
0.3087078630924225, 'F1frequency_sma3nz_amean': 665.1737670898438, 'F1frequency_sma3nz_stddevNorm': 0.4195936620235443, 'F1bandwidth_sma3nz_amean': 1300.4451904296875, 'F1bandwidth_sma3nz_stddevNorm': 0.16353538632392883, 'F1amplitudeLogRelF0_sma3nz_amean': -132.1533660888672, 'F1amplitudeLogRelF0_sma3nz_stddevNorm': -0.6691396832466125, 'F2frequency_sma3nz_amean': 1657.174560546875, 'F2frequency_sma3nz_stddevNorm': 0.17014622688293457, 'F2bandwidth_sma3nz_amean': 1105.97900390625, 'F2bandwidth_sma3nz_stddevNorm': 0.24582387506961823, 'F2amplitudeLogRelF0_sma3nz_amean': -132.76707458496094, 'F2amplitudeLogRelF0_sma3nz_stddevNorm': -0.6468541026115417, 'F3frequency_sma3nz_amean': 2601.5419921875, 'F3frequency_sma3nz_stddevNorm': 0.11460811644792557, 'F3bandwidth_sma3nz_amean': 1091.6160888671875, 'F3bandwidth_sma3nz_stddevNorm': 0.37865355610847473, 'F3amplitudeLogRelF0_sma3nz_amean': -134.5210723876953, 'F3amplitudeLogRelF0_sma3nz_stddevNorm': -0.6203084588050842, 'alphaRatioV_sma3nz_amean': -8.626544952392578, 'alphaRatioV_sma3nz_stddevNorm': -0.4953792989253998, 'hammarbergIndexV_sma3nz_amean': 16.796852111816406, 'hammarbergIndexV_sma3nz_stddevNorm': 0.3567315638065338, 'slopeV0-500_sma3nz_amean': 0.021949267014861107, 'slopeV0-500_sma3nz_stddevNorm': 1.0097254514694214, 'slopeV500-1500_sma3nz_amean': -0.008139749057590961, 'slopeV500-1500_sma3nz_stddevNorm': -1.6243412494659424, 'spectralFluxV_sma3nz_amean': 0.4831695556640625, 'spectralFluxV_sma3nz_stddevNorm': 0.48576220870018005, 'mfcc1V_sma3nz_amean': 20.25444793701172, 'mfcc1V_sma3nz_stddevNorm': 0.44413772225379944, 'mfcc2V_sma3nz_amean': 3.6194145679473877, 'mfcc2V_sma3nz_stddevNorm': 2.17659068107605, 'mfcc3V_sma3nz_amean': 7.736477375030518, 'mfcc3V_sma3nz_stddevNorm': 1.8631006479263306, 'mfcc4V_sma3nz_amean': 4.605042934417725, 'mfcc4V_sma3nz_stddevNorm': 2.8646581172943115, 'alphaRatioUV_sma3nz_amean': -2.5990116596221924, 'hammarbergIndexUV_sma3nz_amean': 8.86290168762207, 
'slopeUV0-500_sma3nz_amean': 0.002166706370189786, 'slopeUV500-1500_sma3nz_amean': 0.00673573836684227, 'spectralFluxUV_sma3nz_amean': 0.24703539907932281, 'loudnessPeaksPerSec': 3.8834950923919678, 'VoicedSegmentsPerSec': 2.745098114013672, 'MeanVoicedSegmentLengthSec': 0.12214285880327225, 'StddevVoicedSegmentLengthSec': 0.09025190770626068, 'MeanUnvoicedSegmentLength': 0.20666664838790894, 'StddevUnvoicedSegmentLength': 0.17666037380695343, 'equivalentSoundLevel_dBp': -24.297258377075195}]\n" + ] + } + ], + "source": [ + "from senselab.audio.tasks.features_extraction.opensmile import extract_opensmile_features_from_audios\n", + "\n", + "features = extract_opensmile_features_from_audios([audio1, audio2])\n", + "\n", + "print(\"OpenSMILE features: {}\".format(features))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Speech Enhancement\n", + "Need to clean up your audio? **EASY!** Here\u2019s how:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Enhanced audios: [Audio(waveform=tensor([[0.0026, 0.0027, 0.0041, ..., 0.0000, 0.0000, 0.0000]]), sampling_rate=16000, orig_path_or_id='../src/tests/data_for_testing/audio_48khz_mono_16bits.wav', metadata={}), Audio(waveform=tensor([[0.0026, 0.0027, 0.0041, ..., 0.0000, 0.0000, 0.0000]]), sampling_rate=16000, orig_path_or_id='../src/tests/data_for_testing/audio_48khz_mono_16bits.wav', metadata={})]\n" + ] + } + ], + "source": [ + "from senselab.audio.tasks.speech_enhancement.api import enhance_audios\n", + "from senselab.utils.data_structures.model import SpeechBrainModel\n", + "\n", + "speechbrain_model = SpeechBrainModel(path_or_uri=\"speechbrain/sepformer-wham16k-enhancement\", revision=\"main\")\n", + "enhanced_audios = enhance_audios(audios=[audio1, audio1], model=speechbrain_model)\n", + "\n", + "print(\"Enhanced audios: {}\".format(enhanced_audios))" + ] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "senselab-UNCffeRf-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}