From 91e866b2bab32897b626e2132631d7db19aff8fc Mon Sep 17 00:00:00 2001
From: Aman Jain
Date: Sat, 26 Oct 2024 13:03:59 +0530
Subject: [PATCH] Add modal integration for STT model

---
 README.md           | 321 ++++----------------------------------------
 app.py              |  77 +++++++++++
 download_weights.py |   4 +
 helper.sh           |  10 ++
 requirements.txt    |   7 +-
 5 files changed, 125 insertions(+), 294 deletions(-)
 create mode 100644 app.py
 create mode 100644 download_weights.py
 create mode 100644 helper.sh

diff --git a/README.md b/README.md
index 7cb96056..a42591f4 100644
--- a/README.md
+++ b/README.md
@@ -1,316 +1,51 @@
-[![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper)
+# Large-v3 Faster Whisper Deployment on Modal
-# Faster Whisper transcription with CTranslate2
+A FastAPI-based server that uses [Faster Whisper](https://github.com/guillaumekln/faster-whisper) for speech-to-text transcription, deployed on [modal.com](https://modal.com). This guide walks you through cloning, setting up, and deploying the server.
-**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.
+---
-This implementation is up to 4 times faster than [openai/whisper](https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.
+## Prerequisites
-## Benchmark
+- **Python 3.x**
+- **[Modal Account](https://modal.com)** for deployment
-### Whisper
+---
-For reference, here's the time and memory usage that are required to transcribe [**13 minutes**](https://www.youtube.com/watch?v=0u7tTptBo9I) of audio using different implementations:
+## Installation Guide
-* [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258)
-* [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362)
-* [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e)
+### 1. Clone the Repository
-### Large-v2 model on GPU
-
-| Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
-| --- | --- | --- | --- | --- | --- |
-| openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
-| faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
-| faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB |
-
-*Executed with CUDA 11.7.1 on a NVIDIA Tesla V100S.*
-
-### Small model on CPU
-
-| Implementation | Precision | Beam size | Time | Max. 
memory | -| --- | --- | --- | --- | --- | -| openai/whisper | fp32 | 5 | 10m31s | 3101MB | -| whisper.cpp | fp32 | 5 | 17m42s | 1581MB | -| whisper.cpp | fp16 | 5 | 12m39s | 873MB | -| faster-whisper | fp32 | 5 | 2m44s | 1675MB | -| faster-whisper | int8 | 5 | 2m04s | 995MB | - -*Executed with 8 threads on a Intel(R) Xeon(R) Gold 6226R.* - - -### Distil-whisper - -| Implementation | Precision | Beam size | Time | Gigaspeech WER | -| --- | --- | --- | --- | --- | -| distil-whisper/distil-large-v2 | fp16 | 4 |- | 10.36 | -| [faster-distil-large-v2](https://huggingface.co/Systran/faster-distil-whisper-large-v2) | fp16 | 5 | - | 10.28 | -| distil-whisper/distil-medium.en | fp16 | 4 | - | 11.21 | -| [faster-distil-medium.en](https://huggingface.co/Systran/faster-distil-whisper-medium.en) | fp16 | 5 | - | 11.21 | - -*Executed with CUDA 11.4 on a NVIDIA 3090.* - -
-testing details (click to expand) - -For `distil-whisper/distil-large-v2`, the WER is tested with code sample from [link](https://huggingface.co/distil-whisper/distil-large-v2#evaluation). for `faster-distil-whisper`, the WER is tested with setting: -```python -from faster_whisper import WhisperModel - -model_size = "distil-large-v2" -# model_size = "distil-medium.en" -# Run on GPU with FP16 -model = WhisperModel(model_size, device="cuda", compute_type="float16") -segments, info = model.transcribe("audio.mp3", beam_size=5, language="en") -``` -
- -## Requirements - -* Python 3.8 or greater - - -### GPU - -GPU execution requires the following NVIDIA libraries to be installed: - -* [cuBLAS for CUDA 12](https://developer.nvidia.com/cublas) -* [cuDNN 8 for CUDA 12](https://developer.nvidia.com/cudnn) - -**Note**: Latest versions of `ctranslate2` support CUDA 12 only. For CUDA 11, the current workaround is downgrading to the `3.24.0` version of `ctranslate2` (This can be done with `pip install --force-reinstall ctranslate2==3.24.0` or specifying the version in a `requirements.txt`). - -There are multiple ways to install the NVIDIA libraries mentioned above. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below. - -
-Other installation methods (click to expand) - - -**Note:** For all these methods below, keep in mind the above note regarding CUDA versions. Depending on your setup, you may need to install the _CUDA 11_ versions of libraries that correspond to the CUDA 12 libraries listed in the instructions below. - -#### Use Docker - -The libraries (cuBLAS, cuDNN) are installed in these official NVIDIA CUDA Docker images: `nvidia/cuda:12.0.0-runtime-ubuntu20.04` or `nvidia/cuda:12.0.0-runtime-ubuntu22.04`. - -#### Install with `pip` (Linux only) - -On Linux these libraries can be installed with `pip`. Note that `LD_LIBRARY_PATH` must be set before launching Python. +Clone the `faster-whisper-modal` repository to your local machine: ```bash -pip install nvidia-cublas-cu12 nvidia-cudnn-cu12 - -export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'` -``` - -**Note**: Version 9+ of `nvidia-cudnn-cu12` appears to cause issues due its reliance on cuDNN 9 (Faster-Whisper does not currently support cuDNN 9). Ensure your version of the Python package is for cuDNN 8. - -#### Download the libraries from Purfview's repository (Windows & Linux) - -Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`. - -
- -## Installation - -The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/): - -```bash -pip install faster-whisper +git clone https://github.com/SYSTRAN/faster-whisper.git +cd faster-whisper-modal ``` -
-Other installation methods (click to expand) - -### Install the master branch - -```bash -pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz" -``` -### Install a specific commit +### 2. Install the Modal SDK +Install the Modal SDK for deploying applications to the Modal cloud: ```bash -pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" -``` - -
- -## Usage - -### Faster-whisper - -```python -from faster_whisper import WhisperModel - -model_size = "large-v3" - -# Run on GPU with FP16 -model = WhisperModel(model_size, device="cuda", compute_type="float16") - -# or run on GPU with INT8 -# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16") -# or run on CPU with INT8 -# model = WhisperModel(model_size, device="cpu", compute_type="int8") - -segments, info = model.transcribe("audio.mp3", beam_size=5) - -print("Detected language '%s' with probability %f" % (info.language, info.language_probability)) - -for segment in segments: - print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) -``` - -**Warning:** `segments` is a *generator* so the transcription only starts when you iterate over it. The transcription can be run to completion by gathering the segments in a list or a `for` loop: - -```python -segments, _ = model.transcribe("audio.mp3") -segments = list(segments) # The transcription will actually run here. -``` - -### multi-segment language detection - -To directly use the model for improved language detection, the following code snippet can be used: - -```python -from faster_whisper import WhisperModel -model = WhisperModel("medium", device="cuda", compute_type="float16") -language_info = model.detect_language_multi_segment("audio.mp3") -``` - -### Batched faster-whisper - -The following code snippet illustrates how to run inference with batched version on an example audio file. Please also refer to the test scripts of batched faster whisper. - -```python -from faster_whisper import WhisperModel, BatchedInferencePipeline - -model = WhisperModel("medium", device="cuda", compute_type="float16") -batched_model = BatchedInferencePipeline(model=model) -segments, info = batched_model.transcribe("audio.mp3", batch_size=16) - -for segment in segments: - print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) -``` - -### Faster Distil-Whisper - -The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3) -checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet -demonstrates how to run inference with distil-large-v3 on a specified audio file: - -```python -from faster_whisper import WhisperModel - -model_size = "distil-large-v3" - -model = WhisperModel(model_size, device="cuda", compute_type="float16") -segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False) - -for segment in segments: - print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) -``` - -For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3). - -### Word-level timestamps - -```python -segments, _ = model.transcribe("audio.mp3", word_timestamps=True) - -for segment in segments: - for word in segment.words: - print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word)) +pip install modal ``` -### VAD filter - -The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) model to filter out parts of the audio without speech: - -```python -segments, _ = model.transcribe("audio.mp3", vad_filter=True) -``` - -The default behavior is conservative and only removes silence longer than 2 seconds. 
See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`: - -```python -segments, _ = model.transcribe( - "audio.mp3", - vad_filter=True, - vad_parameters=dict(min_silence_duration_ms=500), -) -``` - -### Logging - -The library logging level can be configured like this: - -```python -import logging - -logging.basicConfig() -logging.getLogger("faster_whisper").setLevel(logging.DEBUG) -``` - -### Going further - -See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. - -## Community integrations - -Here is a non exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list! - - -* [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) is an OpenAI compatible server using `faster-whisper`. It's easily deployable with Docker, works with OpenAI SDKs/CLI, supports streaming, and live transcription. -* [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment -* [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper. -* [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo. -* [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) Standalone CLI executables of faster-whisper for Windows, Linux & macOS. -* [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end to end multi-speaker speech to text solution implemented using AzureML pipelines. -* [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT. -* [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper, it can export word level transcript and the exported transcript then can be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor) -* [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux. -* [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art. -* [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time. -* [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface. 
-
-## Model conversion
-
-When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
-
-We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
-
-For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
-
+### 3. Set Up Modal
+Set up Modal authentication. This will open a browser window for you to authorize access to your Modal account:
 ```bash
-pip install transformers[torch]>=4.23
-
-ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
---copy_files tokenizer.json preprocessor_config.json --quantization float16
+python3 -m modal setup
 ```
-* The option `--model` accepts a model name on the Hub or a path to a model directory.
-* If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
-
-Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
-
-### Load a converted model
-
-1. Directly load the model from a local directory:
-```python
-model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
+### 4. Deploy the App on Modal
+Deploy the app on Modal; the app URL is printed in the terminal and also shown on the Modal dashboard:
+```bash
+modal deploy app.py
 ```
-2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
-```python
-model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
-```
-
-## Comparing performance against other implementations
-
-If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular:
+### 5. Test the Deployed App
+Once the app is deployed, copy its URL from the terminal output or the Modal dashboard and send an audio file to the `/transcribe` endpoint, for example:
-
-* Verify that the same transcription options are used, especially the same beam size. For example in openai/whisper, `model.transcribe` uses a default beam size of 1 but here we use a default beam size of 5.
-* When running on CPU, make sure to set the same number of threads. 
Many frameworks will read the environment variable `OMP_NUM_THREADS`, which can be set when running your script:
-
-```bash
-OMP_NUM_THREADS=4 python3 my_script.py
-```
+```bash
+curl --location 'https://your-name--faster-whisper-server-fastapi-wrapper.modal.run/transcribe' \
+--form 'file=@"/path/to/audio.mp3"'
+```
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 00000000..9bfdaef1
--- /dev/null
+++ b/app.py
@@ -0,0 +1,77 @@
+from io import BytesIO
+
+import modal
+
+from fastapi import FastAPI, File, HTTPException, UploadFile
+from fastapi.responses import JSONResponse
+import modal.gpu
+
+from faster_whisper import WhisperModel
+
+web_app = FastAPI()
+
+model = None
+
+
+def initialize_model():
+    global model
+    model_size = "large-v3"
+    model = WhisperModel(model_size, device="cuda", compute_type="float16")
+    print(f"Model {model_size} weights loaded on GPU")
+
+
+@web_app.post("/transcribe")
+async def transcribe_audio(file: UploadFile = File(...)):
+    supported_audio_extensions = (".mp3", ".wav", ".flac")
+    if not file.filename.lower().endswith(supported_audio_extensions):
+        raise HTTPException(status_code=400, detail="File type not supported. Please upload MP3, WAV, or FLAC files.")
+
+    audio_data = BytesIO(await file.read())
+
+    if model is None:
+        raise HTTPException(status_code=500, detail="Model is not initialized")
+
+    segments, info = model.transcribe(audio_data, beam_size=5)
+
+    response = {
+        "language": info.language,
+        "language_probability": info.language_probability,
+        "transcription": [{"start": segment.start, "end": segment.end, "text": segment.text} for segment in segments],
+    }
+
+    return JSONResponse(content=response)
+
+
+app = modal.App("faster-whisper-server")
+
+image = (
+    modal.Image.from_registry("nvidia/cuda:12.0.1-cudnn8-runtime-ubuntu20.04", add_python="3.11")
+    .pip_install(
+        "ctranslate2==4.4.0",
+        "huggingface_hub>=0.13",
+        "tokenizers>=0.13,<1",
+        "onnxruntime>=1.14,<2",
+        "pyannote-audio>=3.1.1",
+        "torch",
+        "av>=11",
+        "google",
+        "tqdm",
+        "fastapi==0.115.3",
+        "faster-whisper==1.0.3",
+        "uvicorn==0.32.0",
+        "python-multipart",
+        "nvidia-cublas-cu11",
+        "nvidia-cudnn-cu11",
+    )
+    .copy_local_file("download_weights.py", "/root/download_weights.py")
+    .copy_local_file("helper.sh", "/root/helper.sh")
+    .run_commands("python3 /root/download_weights.py")
+    .run_commands("bash /root/helper.sh")
+)
+
+
+@app.function(image=image, gpu=[modal.gpu.H100()], concurrency_limit=1, container_idle_timeout=600)
+@modal.asgi_app()
+def fastapi_wrapper():
+    initialize_model()
+    return web_app
diff --git a/download_weights.py b/download_weights.py
new file mode 100644
index 00000000..5ece5385
--- /dev/null
+++ b/download_weights.py
@@ -0,0 +1,4 @@
+from faster_whisper.utils import download_model
+
+model = download_model(size_or_id="large-v3", local_files_only=False)
+print("Model large-v3 weights downloaded")
diff --git a/helper.sh b/helper.sh
new file mode 100644
index 00000000..372eb6ea
--- /dev/null
+++ b/helper.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+EXPORT_COMMAND='export LD_LIBRARY_PATH=`python3 -c "import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + \":\" + os.path.dirname(nvidia.cudnn.lib.__file__))"`'
+
+if ! grep -Fxq "$EXPORT_COMMAND" /root/.bashrc; then
+    echo "$EXPORT_COMMAND" >> /root/.bashrc
+    echo "Added LD_LIBRARY_PATH update to /root/.bashrc."
+else
+    echo "LD_LIBRARY_PATH update is already present in /root/.bashrc."
+fi
diff --git a/requirements.txt b/requirements.txt
index 71fc482e..6a319623 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,11 @@ ctranslate2>=4.0,<5
 huggingface_hub>=0.13
 tokenizers>=0.13,<1
 onnxruntime>=1.14,<2
+pyannote-audio>=3.1.1
 torch>=2.1.1
 av>=11
-tqdm
\ No newline at end of file
+tqdm
+fastapi==0.115.3
+faster-whisper==1.0.3
+uvicorn==0.32.0
+modal==0.64.222
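
As a complement to the curl command in the README, the deployed endpoint can also be exercised from Python. The snippet below is a minimal client sketch, assuming the `requests` package is installed, the placeholder app URL from the README, and a local `audio.mp3` file:

```python
# Minimal client sketch for the /transcribe endpoint; the URL is a placeholder.
import requests

url = "https://your-name--faster-whisper-server-fastapi-wrapper.modal.run/transcribe"

with open("audio.mp3", "rb") as f:
    # app.py expects a multipart form field named "file" with a .mp3/.wav/.flac filename.
    response = requests.post(url, files={"file": ("audio.mp3", f, "audio/mpeg")})

response.raise_for_status()
result = response.json()

print(result["language"], result["language_probability"])
for segment in result["transcription"]:
    print(f'[{segment["start"]:.2f}s -> {segment["end"]:.2f}s] {segment["text"]}')
```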