Skip to content

Commit

Permalink
Use Silero VAD in Batched Mode (#936)
Browse files Browse the repository at this point in the history
Replace Pyannote VAD with Silero to reduce code duplication and requirements
  • Loading branch information
MahmoudAshraf97 authored Oct 24, 2024
1 parent 574e256 commit 2dbca5e
Show file tree
Hide file tree
Showing 12 changed files with 277 additions and 508 deletions.
4 changes: 2 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
include faster_whisper/assets/silero_vad.onnx
include faster_whisper/assets/silero_encoder_v5.onnx
include faster_whisper/assets/silero_decoder_v5.onnx
include requirements.txt
include requirements.conversion.txt
include faster_whisper/assets/pyannote_vad_model.bin
3 changes: 0 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,6 @@ language_info = model.detect_language_multi_segment("audio.mp3")

### Batched faster-whisper


The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-2-Clause license, and integrates its VAD model into this library. We modified this implementation and also replaced the feature extraction with a faster torch-based implementation. The batched version improves speed by up to 10-12x compared to the OpenAI implementation and 3-4x compared to the sequential faster-whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference.

The following code snippet illustrates how to run inference with batched version on an example audio file. Please also refer to the test scripts of batched faster whisper.

```python
Expand Down
83 changes: 83 additions & 0 deletions benchmark/evaluate_yt_commons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import argparse
import json
import os

from io import BytesIO

from datasets import load_dataset
from evaluate import load
from pytubefix import YouTube
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer

from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio


def url_to_audio(row):
    """Fetch and decode the audio track for one dataset row, in memory.

    Selects the highest-bitrate audio-only MP4 stream of the YouTube video
    referenced by ``row["link"]``, decodes it with ``decode_audio``, stores
    the waveform under ``row["audio"]``, and returns the (mutated) row.
    """
    audio_buffer = BytesIO()
    candidate_streams = YouTube(row["link"]).streams.filter(
        only_audio=True, mime_type="audio/mp4"
    )
    best_stream = candidate_streams.order_by("bitrate").desc().first()
    best_stream.stream_to_buffer(audio_buffer)
    audio_buffer.seek(0)  # rewind so decode_audio reads from the beginning
    row["audio"] = decode_audio(audio_buffer)
    return row


# Command-line options: how many validation clips to evaluate (None = all).
cli = argparse.ArgumentParser(description="WER benchmark")
cli.add_argument(
    "--audio_numb",
    type=int,
    default=None,
    help="Specify the number of validation audio files in the dataset."
    " Set to None to retrieve all audio files.",
)
args = cli.parse_args()

# Word-error-rate metric from the `evaluate` package.
wer_metric = load("wer")

# English text normalizer, configured from the spelling map shipped
# alongside this script.
with open(os.path.join(os.path.dirname(__file__), "normalizer.json"), "r") as f:
    normalizer = EnglishTextNormalizer(json.load(f))

# Stream the evaluation split and attach decoded audio to each row on the fly;
# a small DataLoader prefetches downloads in background workers.
dataset = load_dataset("mobiuslabsgmbh/youtube-commons-asr-eval", streaming=True).map(
    url_to_audio
)
dataset = iter(
    DataLoader(dataset["test"], batch_size=1, prefetch_factor=4, num_workers=2)
)

model = WhisperModel("large-v3", device="cuda")
pipeline = BatchedInferencePipeline(model, device="cuda")


all_transcriptions = []
all_references = []
# Transcribe each clip and collect hypothesis/reference text pairs.
for idx, batch in enumerate(tqdm(dataset, desc="Evaluating...")):
    segments, info = pipeline.transcribe(
        batch["audio"][0],
        batch_size=8,
        word_timestamps=False,
        without_timestamps=True,
    )

    all_transcriptions.append("".join(segment.text for segment in segments))
    all_references.append(batch["text"][0])
    # Stop early once the requested number of clips has been processed.
    if args.audio_numb and idx == args.audio_numb - 1:
        break

# Apply the same text normalization to both sides before scoring.
all_transcriptions = [normalizer(text) for text in all_transcriptions]
all_references = [normalizer(text) for text in all_references]

# Report WER as a percentage.
wer = 100 * wer_metric.compute(
    predictions=all_transcriptions, references=all_references
)
print("WER: %.3f" % wer)
1 change: 1 addition & 0 deletions benchmark/requirements.benchmark.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ evaluate
datasets
memory_profiler
py3nvml
pytubefix
Binary file removed faster_whisper/assets/pyannote_vad_model.bin
Binary file not shown.
Binary file added faster_whisper/assets/silero_decoder_v5.onnx
Binary file not shown.
Binary file added faster_whisper/assets/silero_encoder_v5.onnx
Binary file not shown.
Binary file removed faster_whisper/assets/silero_vad.onnx
Binary file not shown.
Loading

0 comments on commit 2dbca5e

Please sign in to comment.