Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update upstream fork changes #1068

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
include faster_whisper/assets/silero_vad.onnx
include faster_whisper/assets/silero_encoder_v5.onnx
include faster_whisper/assets/silero_decoder_v5.onnx
include requirements.txt
include requirements.conversion.txt
include faster_whisper/assets/pyannote_vad_model.bin
3 changes: 0 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,6 @@ language_info = model.detect_language_multi_segment("audio.mp3")

### Batched faster-whisper


The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX) licensed under the BSD-2 Clause license and integrates its VAD model to this library. We modify this implementation and also replaced the feature extraction with a faster torch-based implementation. Batched version improves the speed upto 10-12x compared to openAI implementation and 3-4x compared to the sequential faster_whisper version. It works by transcribing semantically meaningful audio chunks as batches leading to faster inference.

The following code snippet illustrates how to run inference with batched version on an example audio file. Please also refer to the test scripts of batched faster whisper.

```python
Expand Down
83 changes: 83 additions & 0 deletions benchmark/evaluate_yt_commons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import argparse
import json
import os

from io import BytesIO

from datasets import load_dataset
from evaluate import load
from pytubefix import YouTube
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer

from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio


def url_to_audio(row):
buffer = BytesIO()
yt = YouTube(row["link"])
video = (
yt.streams.filter(only_audio=True, mime_type="audio/mp4")
.order_by("bitrate")
.desc()
.first()
)
video.stream_to_buffer(buffer)
buffer.seek(0)
row["audio"] = decode_audio(buffer)
return row


parser = argparse.ArgumentParser(description="WER benchmark")
parser.add_argument(
"--audio_numb",
type=int,
default=None,
help="Specify the number of validation audio files in the dataset."
" Set to None to retrieve all audio files.",
)
args = parser.parse_args()

# define the evaluation metric
wer_metric = load("wer")

with open(os.path.join(os.path.dirname(__file__), "normalizer.json"), "r") as f:
normalizer = EnglishTextNormalizer(json.load(f))

dataset = load_dataset("mobiuslabsgmbh/youtube-commons-asr-eval", streaming=True).map(
url_to_audio
)
dataset = iter(
DataLoader(dataset["test"], batch_size=1, prefetch_factor=4, num_workers=2)
)

model = WhisperModel("large-v3", device="cuda")
pipeline = BatchedInferencePipeline(model, device="cuda")


all_transcriptions = []
all_references = []
# iterate over the dataset and run inference
for i, row in tqdm(enumerate(dataset), desc="Evaluating..."):
result, info = pipeline.transcribe(
row["audio"][0],
batch_size=8,
word_timestamps=False,
without_timestamps=True,
)

all_transcriptions.append("".join(segment.text for segment in result))
all_references.append(row["text"][0])
if args.audio_numb and i == (args.audio_numb - 1):
break

# normalize predictions and references
all_transcriptions = [normalizer(transcription) for transcription in all_transcriptions]
all_references = [normalizer(reference) for reference in all_references]

# compute the WER metric
wer = 100 * wer_metric.compute(
predictions=all_transcriptions, references=all_references
)
print("WER: %.3f" % wer)
1 change: 1 addition & 0 deletions benchmark/requirements.benchmark.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ evaluate
datasets
memory_profiler
py3nvml
pytubefix
Binary file removed faster_whisper/assets/pyannote_vad_model.bin
Binary file not shown.
Binary file added faster_whisper/assets/silero_decoder_v5.onnx
Binary file not shown.
Binary file added faster_whisper/assets/silero_encoder_v5.onnx
Binary file not shown.
Binary file removed faster_whisper/assets/silero_vad.onnx
Binary file not shown.
88 changes: 79 additions & 9 deletions faster_whisper/audio.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
"""We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV

The advantage of PyAV is that it bundles the FFmpeg libraries so there is no additional
system dependencies. FFmpeg does not need to be installed on the system.

However, the API is quite low-level so we need to manipulate audio frames directly.
"""

import gc
import io
import itertools

from typing import BinaryIO, Union

import av
import numpy as np
import torch
import torchaudio


def decode_audio(
Expand All @@ -17,22 +30,79 @@ def decode_audio(
split_stereo: Return separate left and right channels.

Returns:
A float32 Torch Tensor.
A float32 Numpy array.

If `split_stereo` is enabled, the function returns a 2-tuple with the
separated left and right channels.
"""
resampler = av.audio.resampler.AudioResampler(
format="s16",
layout="mono" if not split_stereo else "stereo",
rate=sampling_rate,
)

waveform, audio_sf = torchaudio.load(input_file) # waveform: channels X T
raw_buffer = io.BytesIO()
dtype = None

with av.open(input_file, mode="r", metadata_errors="ignore") as container:
frames = container.decode(audio=0)
frames = _ignore_invalid_frames(frames)
frames = _group_frames(frames, 500000)
frames = _resample_frames(frames, resampler)

for frame in frames:
array = frame.to_ndarray()
dtype = array.dtype
raw_buffer.write(array)

# It appears that some objects related to the resampler are not freed
# unless the garbage collector is manually run.
del resampler
gc.collect()

audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype)

# Convert s16 back to f32.
audio = audio.astype(np.float32) / 32768.0

if audio_sf != sampling_rate:
waveform = torchaudio.functional.resample(
waveform, orig_freq=audio_sf, new_freq=sampling_rate
)
if split_stereo:
return waveform[0], waveform[1]
left_channel = audio[0::2]
right_channel = audio[1::2]
return torch.from_numpy(left_channel), torch.from_numpy(right_channel)

return torch.from_numpy(audio)


def _ignore_invalid_frames(frames):
iterator = iter(frames)

while True:
try:
yield next(iterator)
except StopIteration:
break
except av.error.InvalidDataError:
continue


def _group_frames(frames, num_samples=None):
fifo = av.audio.fifo.AudioFifo()

for frame in frames:
frame.pts = None # Ignore timestamp check.
fifo.write(frame)

if num_samples is not None and fifo.samples >= num_samples:
yield fifo.read()

if fifo.samples > 0:
yield fifo.read()


return waveform.mean(0)
def _resample_frames(frames, resampler):
# Add None to flush the resampler.
for frame in itertools.chain(frames, [None]):
yield from resampler.resample(frame)


def pad_or_trim(array, length: int, *, axis: int = -1):
Expand Down
Loading
Loading