From 91e866b2bab32897b626e2132631d7db19aff8fc Mon Sep 17 00:00:00 2001
From: Aman Jain
Date: Sat, 26 Oct 2024 13:03:59 +0530
Subject: [PATCH] Add modal integration for STT model

---
 README.md           | 321 ++++----------------------------------------
 app.py              |  77 +++++++++++
 download_weights.py |   4 +
 helper.sh           |  10 ++
 requirements.txt    |   7 +-
 5 files changed, 125 insertions(+), 294 deletions(-)
 create mode 100644 app.py
 create mode 100644 download_weights.py
 create mode 100644 helper.sh

diff --git a/README.md b/README.md
index 7cb96056..a42591f4 100644
--- a/README.md
+++ b/README.md
@@ -1,316 +1,51 @@
-[![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper)
+# Large-v3 Faster Whisper Deployment on Modal
-# Faster Whisper transcription with CTranslate2
+A FastAPI-based server that uses [Faster Whisper](https://github.com/guillaumekln/faster-whisper) for speech-to-text transcription, deployed on [modal.com](https://modal.com). This guide walks you through cloning, setting up, and deploying the server.
-**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.
+---
-This implementation is up to 4 times faster than [openai/whisper](https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.
+## Prerequisites
-## Benchmark
+- **Python 3.x**
+- **[Modal Account](https://modal.com)** for deployment
-### Whisper
+---
-For reference, here's the time and memory usage that are required to transcribe [**13 minutes**](https://www.youtube.com/watch?v=0u7tTptBo9I) of audio using different implementations:
+## Installation Guide
-* [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258)
-* [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362)
-* [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e)
+### 1. Clone the Repository
-### Large-v2 model on GPU
-
-| Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
-| --- | --- | --- | --- | --- | --- |
-| openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
-| faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
-| faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB |
-
-*Executed with CUDA 11.7.1 on a NVIDIA Tesla V100S.*
-
-### Small model on CPU
-
-| Implementation | Precision | Beam size | Time | Max. 
memory | -| --- | --- | --- | --- | --- | -| openai/whisper | fp32 | 5 | 10m31s | 3101MB | -| whisper.cpp | fp32 | 5 | 17m42s | 1581MB | -| whisper.cpp | fp16 | 5 | 12m39s | 873MB | -| faster-whisper | fp32 | 5 | 2m44s | 1675MB | -| faster-whisper | int8 | 5 | 2m04s | 995MB | - -*Executed with 8 threads on a Intel(R) Xeon(R) Gold 6226R.* - - -### Distil-whisper - -| Implementation | Precision | Beam size | Time | Gigaspeech WER | -| --- | --- | --- | --- | --- | -| distil-whisper/distil-large-v2 | fp16 | 4 |- | 10.36 | -| [faster-distil-large-v2](https://huggingface.co/Systran/faster-distil-whisper-large-v2) | fp16 | 5 | - | 10.28 | -| distil-whisper/distil-medium.en | fp16 | 4 | - | 11.21 | -| [faster-distil-medium.en](https://huggingface.co/Systran/faster-distil-whisper-medium.en) | fp16 | 5 | - | 11.21 | - -*Executed with CUDA 11.4 on a NVIDIA 3090.* - -
-testing details (click to expand) - -For `distil-whisper/distil-large-v2`, the WER is tested with code sample from [link](https://huggingface.co/distil-whisper/distil-large-v2#evaluation). for `faster-distil-whisper`, the WER is tested with setting: -```python -from faster_whisper import WhisperModel - -model_size = "distil-large-v2" -# model_size = "distil-medium.en" -# Run on GPU with FP16 -model = WhisperModel(model_size, device="cuda", compute_type="float16") -segments, info = model.transcribe("audio.mp3", beam_size=5, language="en") -``` -
- -## Requirements - -* Python 3.8 or greater - - -### GPU - -GPU execution requires the following NVIDIA libraries to be installed: - -* [cuBLAS for CUDA 12](https://developer.nvidia.com/cublas) -* [cuDNN 8 for CUDA 12](https://developer.nvidia.com/cudnn) - -**Note**: Latest versions of `ctranslate2` support CUDA 12 only. For CUDA 11, the current workaround is downgrading to the `3.24.0` version of `ctranslate2` (This can be done with `pip install --force-reinstall ctranslate2==3.24.0` or specifying the version in a `requirements.txt`). - -There are multiple ways to install the NVIDIA libraries mentioned above. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below. - -
-Other installation methods (click to expand) - - -**Note:** For all these methods below, keep in mind the above note regarding CUDA versions. Depending on your setup, you may need to install the _CUDA 11_ versions of libraries that correspond to the CUDA 12 libraries listed in the instructions below. - -#### Use Docker - -The libraries (cuBLAS, cuDNN) are installed in these official NVIDIA CUDA Docker images: `nvidia/cuda:12.0.0-runtime-ubuntu20.04` or `nvidia/cuda:12.0.0-runtime-ubuntu22.04`. - -#### Install with `pip` (Linux only) - -On Linux these libraries can be installed with `pip`. Note that `LD_LIBRARY_PATH` must be set before launching Python. +Clone the `faster-whisper-modal` repository to your local machine: ```bash -pip install nvidia-cublas-cu12 nvidia-cudnn-cu12 - -export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'` -``` - -**Note**: Version 9+ of `nvidia-cudnn-cu12` appears to cause issues due its reliance on cuDNN 9 (Faster-Whisper does not currently support cuDNN 9). Ensure your version of the Python package is for cuDNN 8. - -#### Download the libraries from Purfview's repository (Windows & Linux) - -Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`. - -
- -## Installation - -The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/): - -```bash -pip install faster-whisper +git clone https://github.com/SYSTRAN/faster-whisper.git +cd faster-whisper-modal ``` -
-Other installation methods (click to expand) - -### Install the master branch - -```bash -pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz" -``` -### Install a specific commit +### 2. Install the Modal SDK +Install the Modal SDK for deploying applications to the Modal cloud: ```bash -pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" -``` - -
- -## Usage - -### Faster-whisper - -```python -from faster_whisper import WhisperModel - -model_size = "large-v3" - -# Run on GPU with FP16 -model = WhisperModel(model_size, device="cuda", compute_type="float16") - -# or run on GPU with INT8 -# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16") -# or run on CPU with INT8 -# model = WhisperModel(model_size, device="cpu", compute_type="int8") - -segments, info = model.transcribe("audio.mp3", beam_size=5) - -print("Detected language '%s' with probability %f" % (info.language, info.language_probability)) - -for segment in segments: - print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) -``` - -**Warning:** `segments` is a *generator* so the transcription only starts when you iterate over it. The transcription can be run to completion by gathering the segments in a list or a `for` loop: - -```python -segments, _ = model.transcribe("audio.mp3") -segments = list(segments) # The transcription will actually run here. -``` - -### multi-segment language detection - -To directly use the model for improved language detection, the following code snippet can be used: - -```python -from faster_whisper import WhisperModel -model = WhisperModel("medium", device="cuda", compute_type="float16") -language_info = model.detect_language_multi_segment("audio.mp3") -``` - -### Batched faster-whisper - -The following code snippet illustrates how to run inference with batched version on an example audio file. Please also refer to the test scripts of batched faster whisper. - -```python -from faster_whisper import WhisperModel, BatchedInferencePipeline - -model = WhisperModel("medium", device="cuda", compute_type="float16") -batched_model = BatchedInferencePipeline(model=model) -segments, info = batched_model.transcribe("audio.mp3", batch_size=16) - -for segment in segments: - print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) -``` - -### Faster Distil-Whisper - -The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3) -checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet -demonstrates how to run inference with distil-large-v3 on a specified audio file: - -```python -from faster_whisper import WhisperModel - -model_size = "distil-large-v3" - -model = WhisperModel(model_size, device="cuda", compute_type="float16") -segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False) - -for segment in segments: - print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) -``` - -For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3). - -### Word-level timestamps - -```python -segments, _ = model.transcribe("audio.mp3", word_timestamps=True) - -for segment in segments: - for word in segment.words: - print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word)) +pip install modal ``` -### VAD filter - -The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) model to filter out parts of the audio without speech: - -```python -segments, _ = model.transcribe("audio.mp3", vad_filter=True) -``` - -The default behavior is conservative and only removes silence longer than 2 seconds. 
See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`: - -```python -segments, _ = model.transcribe( - "audio.mp3", - vad_filter=True, - vad_parameters=dict(min_silence_duration_ms=500), -) -``` - -### Logging - -The library logging level can be configured like this: - -```python -import logging - -logging.basicConfig() -logging.getLogger("faster_whisper").setLevel(logging.DEBUG) -``` - -### Going further - -See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. - -## Community integrations - -Here is a non exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list! - - -* [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) is an OpenAI compatible server using `faster-whisper`. It's easily deployable with Docker, works with OpenAI SDKs/CLI, supports streaming, and live transcription. -* [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment -* [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper. -* [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo. -* [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) Standalone CLI executables of faster-whisper for Windows, Linux & macOS. -* [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end to end multi-speaker speech to text solution implemented using AzureML pipelines. -* [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT. -* [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper, it can export word level transcript and the exported transcript then can be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor) -* [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux. -* [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art. -* [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time. -* [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface. 
-
-## Model conversion
-
-When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
-
-We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
-
-For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
-
+### 3. Set Up Modal
+Set up Modal authentication. This will open a browser window for you to authorize access to your Modal account:
 ```bash
-pip install transformers[torch]>=4.23
-
-ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
---copy_files tokenizer.json preprocessor_config.json --quantization float16
+python3 -m modal setup
 ```
-* The option `--model` accepts a model name on the Hub or a path to a model directory.
-* If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
-
-Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
-
-### Load a converted model
-
-1. Directly load the model from a local directory:
-```python
-model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
+### 4. Deploy the App on Modal
+Deploy the app on Modal; the app URL is printed in the terminal and also shown on the Modal dashboard:
+```bash
+modal deploy app.py
 ```
-2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
-```python
-model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
-```
-
-## Comparing performance against other implementations
-
-If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular:
+### 5. Test the Deployed App
+Once the app is deployed, copy its URL from the terminal output or the Modal dashboard and send an audio file to the `/transcribe` endpoint, for example:
-
-* Verify that the same transcription options are used, especially the same beam size. For example in openai/whisper, `model.transcribe` uses a default beam size of 1 but here we use a default beam size of 5.
-* When running on CPU, make sure to set the same number of threads. 
Many frameworks will read the environment variable `OMP_NUM_THREADS`, which can be set when running your script:
-
-```bash
-OMP_NUM_THREADS=4 python3 my_script.py
-```
+```bash
+curl --location 'https://your-name--faster-whisper-server-fastapi-wrapper.modal.run/transcribe' \
+--form 'file=@"/path/to/audio.mp3"'
+```
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 00000000..9bfdaef1
--- /dev/null
+++ b/app.py
@@ -0,0 +1,77 @@
+from io import BytesIO
+
+import modal
+
+from fastapi import FastAPI, File, HTTPException, UploadFile
+from fastapi.responses import JSONResponse
+import modal.gpu
+
+from faster_whisper import WhisperModel
+
+web_app = FastAPI()
+
+model = None
+
+
+def initialize_model():
+    global model
+    model_size = "large-v3"
+    model = WhisperModel(model_size, device="cuda", compute_type="float16")
+    print(f"Model {model_size} weights loaded on GPU")
+
+
+@web_app.post("/transcribe")
+async def transcribe_audio(file: UploadFile = File(...)):
+    supported_audio_extensions = (".mp3", ".wav", ".flac")
+    if not file.filename.lower().endswith(supported_audio_extensions):
+        raise HTTPException(status_code=400, detail="File type not supported. Please upload MP3, WAV, or FLAC files.")
+
+    audio_data = BytesIO(await file.read())
+
+    if model is None:
+        raise HTTPException(status_code=500, detail="Model is not initialized")
+
+    segments, info = model.transcribe(audio_data, beam_size=5)
+
+    response = {
+        "language": info.language,
+        "language_probability": info.language_probability,
+        "transcription": [{"start": segment.start, "end": segment.end, "text": segment.text} for segment in segments],
+    }
+
+    return JSONResponse(content=response)
+
+
+app = modal.App("faster-whisper-server")
+
+image = (
+    modal.Image.from_registry("nvidia/cuda:12.0.1-cudnn8-runtime-ubuntu20.04", add_python="3.11")
+    .pip_install(
+        "ctranslate2==4.4.0",
+        "huggingface_hub>=0.13",
+        "tokenizers>=0.13,<1",
+        "onnxruntime>=1.14,<2",
+        "pyannote-audio>=3.1.1",
+        "torch",
+        "av>=11",
+        "google",
+        "tqdm",
+        "fastapi==0.115.3",
+        "faster-whisper==1.0.3",
+        "uvicorn==0.32.0",
+        "python-multipart",
+        "nvidia-cublas-cu11",
+        "nvidia-cudnn-cu11",
+    )
+    .copy_local_file("download_weights.py", "/root/download_weights.py")
+    .copy_local_file("helper.sh", "/root/helper.sh")
+    .run_commands("python3 /root/download_weights.py")
+    .run_commands("bash /root/helper.sh")
+)
+
+
+@app.function(image=image, gpu=[modal.gpu.H100()], concurrency_limit=1, container_idle_timeout=600)
+@modal.asgi_app()
+def fastapi_wrapper():
+    initialize_model()
+    return web_app
diff --git a/download_weights.py b/download_weights.py
new file mode 100644
index 00000000..5ece5385
--- /dev/null
+++ b/download_weights.py
@@ -0,0 +1,4 @@
+from faster_whisper.utils import download_model
+
+model = download_model(size_or_id="large-v3", local_files_only=False)
+print("Model large-v3 weights downloaded")
diff --git a/helper.sh b/helper.sh
new file mode 100644
index 00000000..372eb6ea
--- /dev/null
+++ b/helper.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+EXPORT_COMMAND='export LD_LIBRARY_PATH=`python3 -c "import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + \":\" + os.path.dirname(nvidia.cudnn.lib.__file__))"`'
+
+if ! grep -Fxq "$EXPORT_COMMAND" /root/.bashrc; then
+    echo "$EXPORT_COMMAND" >> /root/.bashrc
+    echo "Added LD_LIBRARY_PATH update to /root/.bashrc."
+else
+    echo "LD_LIBRARY_PATH update is already present in /root/.bashrc."
+fi
diff --git a/requirements.txt b/requirements.txt
index 71fc482e..6a319623 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,11 @@ ctranslate2>=4.0,<5
 huggingface_hub>=0.13
 tokenizers>=0.13,<1
 onnxruntime>=1.14,<2
+pyannote-audio>=3.1.1
 torch>=2.1.1
 av>=11
-tqdm
\ No newline at end of file
+tqdm
+fastapi==0.115.3
+faster-whisper==1.0.3
+uvicorn==0.32.0
+modal==0.64.222
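
As a complement to the curl command in the README, the deployed endpoint can also be exercised from Python. The snippet below is a minimal client sketch, assuming the `requests` package is installed, the placeholder app URL from the README, and a local `audio.mp3` file:

```python
# Minimal client sketch for the /transcribe endpoint; the URL is a placeholder.
import requests

url = "https://your-name--faster-whisper-server-fastapi-wrapper.modal.run/transcribe"

with open("audio.mp3", "rb") as f:
    # app.py expects a multipart form field named "file" with a .mp3/.wav/.flac filename.
    response = requests.post(url, files={"file": ("audio.mp3", f, "audio/mpeg")})

response.raise_for_status()
result = response.json()

print(result["language"], result["language_probability"])
for segment in result["transcription"]:
    print(f'[{segment["start"]:.2f}s -> {segment["end"]:.2f}s] {segment["text"]}')
```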