SYSTRAN · zazpie · Jul 21, 2024 · Jul 25, 2024 · Jul 26, 2024 · Jul 28, 2024
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,4 +1,4 @@
-include faster_whisper/assets/silero_vad.onnx
+include faster_whisper/assets/silero_encoder_v5.onnx
+include faster_whisper/assets/silero_decoder_v5.onnx
 include requirements.txt
 include requirements.conversion.txt
-include faster_whisper/assets/pyannote_vad_model.bin
diff --git a/README.md b/README.md
@@ -178,9 +178,6 @@ language_info = model.detect_language_multi_segment("audio.mp3")
 
 ### Batched faster-whisper
 
-
-The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX) licensed under the BSD-2 Clause license and integrates its VAD model to this library. We modify this implementation and also replaced the feature extraction with a faster torch-based implementation. Batched version improves the speed upto 10-12x compared to openAI implementation and 3-4x compared to the sequential faster_whisper version. It works by transcribing semantically meaningful audio chunks as batches leading to faster inference. 
-
 The following code snippet illustrates how to run inference with batched version on an example audio file. Please also refer to the test scripts of batched faster whisper.
 
 ```python

diff --git a/benchmark/evaluate_yt_commons.py b/benchmark/evaluate_yt_commons.py
@@ -0,0 +1,83 @@
+import argparse
+import json
+import os
+
+from io import BytesIO
+
+from datasets import load_dataset
+from evaluate import load
+from pytubefix import YouTube
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers.models.whisper.english_normalizer import EnglishTextNormalizer
+
+from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio
+
+
+def url_to_audio(row):
+    buffer = BytesIO()
+    yt = YouTube(row["link"])
+    video = (
+        yt.streams.filter(only_audio=True, mime_type="audio/mp4")
+        .order_by("bitrate")
+        .desc()
+        .first()
+    )
+    video.stream_to_buffer(buffer)
+    buffer.seek(0)
+    row["audio"] = decode_audio(buffer)
+    return row
+
+
+parser = argparse.ArgumentParser(description="WER benchmark")
+parser.add_argument(
+    "--audio_numb",
+    type=int,
+    default=None,
+    help="Specify the number of validation audio files in the dataset."
+    " Set to None to retrieve all audio files.",
+)
+args = parser.parse_args()
+
+# define the evaluation metric
+wer_metric = load("wer")
+
+with open(os.path.join(os.path.dirname(__file__), "normalizer.json"), "r") as f:
+    normalizer = EnglishTextNormalizer(json.load(f))
+
+dataset = load_dataset("mobiuslabsgmbh/youtube-commons-asr-eval", streaming=True).map(
+    url_to_audio
+)
+dataset = iter(
+    DataLoader(dataset["test"], batch_size=1, prefetch_factor=4, num_workers=2)
+)
+
+model = WhisperModel("large-v3", device="cuda")
+pipeline = BatchedInferencePipeline(model, device="cuda")
+
+
+all_transcriptions = []
+all_references = []
+# iterate over the dataset and run inference
+for i, row in tqdm(enumerate(dataset), desc="Evaluating..."):
+    result, info = pipeline.transcribe(
+        row["audio"][0],
+        batch_size=8,
+        word_timestamps=False,
+        without_timestamps=True,
+    )
+
+    all_transcriptions.append("".join(segment.text for segment in result))
+    all_references.append(row["text"][0])
+    if args.audio_numb and i == (args.audio_numb - 1):
+        break
+
+# normalize predictions and references
+all_transcriptions = [normalizer(transcription) for transcription in all_transcriptions]
+all_references = [normalizer(reference) for reference in all_references]
+
+# compute the WER metric
+wer = 100 * wer_metric.compute(
+    predictions=all_transcriptions, references=all_references
+)
+print("WER: %.3f" % wer)
diff --git a/benchmark/requirements.benchmark.txt b/benchmark/requirements.benchmark.txt
@@ -4,3 +4,4 @@ evaluate
 datasets
 memory_profiler
 py3nvml
+pytubefix
diff --git a/faster_whisper/assets/pyannote_vad_model.bin b/faster_whisper/assets/pyannote_vad_model.bin
diff --git a/faster_whisper/assets/silero_decoder_v5.onnx b/faster_whisper/assets/silero_decoder_v5.onnx
diff --git a/faster_whisper/assets/silero_encoder_v5.onnx b/faster_whisper/assets/silero_encoder_v5.onnx
diff --git a/faster_whisper/assets/silero_vad.onnx b/faster_whisper/assets/silero_vad.onnx
diff --git a/faster_whisper/audio.py b/faster_whisper/audio.py
@@ -1,7 +1,20 @@
+"""We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV
+
+The advantage of PyAV is that it bundles the FFmpeg libraries so there are no
+additional system dependencies. FFmpeg does not need to be installed on the system.
+
+However, the API is quite low-level so we need to manipulate audio frames directly.
+"""
+
+import gc
+import io
+import itertools
+
 from typing import BinaryIO, Union
 
+import av
+import numpy as np
 import torch
-import torchaudio
 
 
 def decode_audio(
@@ -17,22 +30,79 @@ def decode_audio(
       split_stereo: Return separate left and right channels.
 
     Returns:
-      A float32 Torch Tensor.
+      A float32 Torch Tensor (converted from the decoded Numpy array).
 
       If `split_stereo` is enabled, the function returns a 2-tuple with the
       separated left and right channels.
     """
+    resampler = av.audio.resampler.AudioResampler(
+        format="s16",
+        layout="mono" if not split_stereo else "stereo",
+        rate=sampling_rate,
+    )
 
-    waveform, audio_sf = torchaudio.load(input_file)  # waveform: channels X T
+    raw_buffer = io.BytesIO()
+    dtype = None
+
+    with av.open(input_file, mode="r", metadata_errors="ignore") as container:
+        frames = container.decode(audio=0)
+        frames = _ignore_invalid_frames(frames)
+        frames = _group_frames(frames, 500000)
+        frames = _resample_frames(frames, resampler)
+
+        for frame in frames:
+            array = frame.to_ndarray()
+            dtype = array.dtype
+            raw_buffer.write(array)
+
+    # It appears that some objects related to the resampler are not freed
+    # unless the garbage collector is manually run.
+    del resampler
+    gc.collect()
+
+    audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype)
+
+    # Convert s16 back to f32.
+    audio = audio.astype(np.float32) / 32768.0
 
-    if audio_sf != sampling_rate:
-        waveform = torchaudio.functional.resample(
-            waveform, orig_freq=audio_sf, new_freq=sampling_rate
-        )
     if split_stereo:
-        return waveform[0], waveform[1]
+        left_channel = audio[0::2]
+        right_channel = audio[1::2]
+        return torch.from_numpy(left_channel), torch.from_numpy(right_channel)
+
+    return torch.from_numpy(audio)
+
+
+def _ignore_invalid_frames(frames):
+    iterator = iter(frames)
+
+    while True:
+        try:
+            yield next(iterator)
+        except StopIteration:
+            break
+        except av.error.InvalidDataError:
+            continue
+
+
+def _group_frames(frames, num_samples=None):
+    fifo = av.audio.fifo.AudioFifo()
+
+    for frame in frames:
+        frame.pts = None  # Ignore timestamp check.
+        fifo.write(frame)
+
+        if num_samples is not None and fifo.samples >= num_samples:
+            yield fifo.read()
+
+    if fifo.samples > 0:
+        yield fifo.read()
+
 
-    return waveform.mean(0)
+def _resample_frames(frames, resampler):
+    # Add None to flush the resampler.
+    for frame in itertools.chain(frames, [None]):
+        yield from resampler.resample(frame)
 
 
 def pad_or_trim(array, length: int, *, axis: int = -1):