Speech-to-Text with Huggingface Whisper V3 #26

Open
Bensonheimer992 opened this issue Oct 1, 2024 · 0 comments
Hey, I'm trying to make a little Python script for transcribing voice to text with the Huggingface Inference API, but nothing happens. Can someone maybe help me?

import argparse
import asyncio
import logging
from functools import partial

from wyoming.info import AsrModel, AsrProgram, Attribution, Info
from wyoming.server import AsyncServer

from handler import HuggingfaceWhisper

LOGGER = logging.getLogger(__name__)

async def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--key", required=True, help="Your Huggingface API Key")
    parser.add_argument("--uri", required=True, help="unix:// or tcp://")

    args = parser.parse_args()

    wyoming_info = Info(
        asr=[
            AsrProgram(
                name="Huggingface Whisper",
                description="Faster Whisper transcription with Whisper Large V3",
                attribution=Attribution(
                    name="Bensonheimer992",
                    url="https://github.com/Bensonheimer992"
                ),
                installed=True,
                version="1.0",
                models=[
                    AsrModel(
                        name="Whisper Large V3",
                        description="The Large Whisper Model",
                        attribution=Attribution(
                            name="OpenAI",
                            url="https://huggingface.co/openai",
                        ),
                        installed=True,
                        languages=["de", "en"],
                        version="3.0",
                    )
                ],
            )
        ],
    )

    server = AsyncServer.from_uri(args.uri)
    LOGGER.info("Ready!")
    lock = asyncio.Lock()
    await server.run(
        partial(
            HuggingfaceWhisper,
            wyoming_info,
            args,
            lock
        )
    )

if __name__ == "__main__":
    asyncio.run(main())
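
For context, neither file configures logging, so LOGGER.info("Ready!") and the debug messages never appear even when the server starts. Below is a minimal launch sketch, assuming the first file is saved as main.py next to handler.py; the basicConfig call, the filenames, and the tcp://0.0.0.0:10300 URI in the example invocation are assumptions, not part of the original code.

# run_sketch.py -- hypothetical launcher that just turns logging on before starting the server
import asyncio
import logging

from main import main  # "main.py" is an assumed filename for the script above

if __name__ == "__main__":
    # Without basicConfig the LOGGER output in both files stays invisible
    logging.basicConfig(level=logging.DEBUG)
    asyncio.run(main())

It would then be started with something like: python3 run_sketch.py --key <HF API key> --uri tcp://0.0.0.0:10300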
# handler.py (module name implied by "from handler import HuggingfaceWhisper" above)
import argparse
import asyncio
import logging
import os.path
import tempfile
import wave
from typing import Optional

import aiohttp
from wyoming.asr import Transcript, Transcribe
from wyoming.audio import AudioChunk, AudioStop
from wyoming.event import Event
from wyoming.info import Info, Describe
from wyoming.server import AsyncEventHandler

LOGGER = logging.getLogger(__name__)

API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"

class HuggingfaceWhisper(AsyncEventHandler):
    def __init__(self, wyoming_info: Info, cliargs: argparse.Namespace, lock: asyncio.Lock, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.cliargs = cliargs
        self.wyoming_info_event = wyoming_info.event()
        self.lock = lock
        self.wavdir = tempfile.TemporaryDirectory()
        self.wavpath = os.path.join(self.wavdir.name, "speech.wav")
        # Must start as None so the first AudioChunk opens the file and writes the WAV header
        self.wavfile: Optional[wave.Wave_write] = None

    async def handle_event(self, event: Event) -> bool:
        if AudioChunk.is_type(event.type):
            # Buffer incoming audio into the temporary WAV file
            chunk = AudioChunk.from_event(event)
            if self.wavfile is None:
                self.wavfile = wave.open(self.wavpath, "wb")
                self.wavfile.setframerate(chunk.rate)
                self.wavfile.setsampwidth(chunk.width)
                self.wavfile.setnchannels(chunk.channels)

            self.wavfile.writeframes(chunk.audio)
            return True

        if AudioStop.is_type(event.type):
            LOGGER.debug("Audio Stopped. Transcribing ...")
            assert self.wavfile is not None

            self.wavfile.close()
            self.wavfile = None

            async with self.lock:
                try:
                    headers = {"Authorization": f"Bearer {self.cliargs.key}"}
                    async with aiohttp.ClientSession() as session:
                        with open(self.wavpath, "rb") as f:
                            data = f.read()
                            async with session.post(API_URL, headers=headers, data=data) as response:
                                if response.status == 200:
                                    result = await response.json()
                                    text = result.get('text', '')
                                    LOGGER.info("Transcription received")
                                    await self.write_event(Transcript(text=text).event())
                                else:
                                    body = await response.text()
                                    LOGGER.error(f"Error from Huggingface API: {response.status} {body}")
                except Exception as e:
                    LOGGER.error(f"Error during Transcription: {str(e)}")

            # Close the connection once the transcript (or error) has been handled
            return False

        if Transcribe.is_type(event.type):
            return True

        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info_event)
            LOGGER.debug("Sent Info")
            return True

        return False
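
And to check the Hugging Face side separately from the Wyoming plumbing, here is a minimal standalone sketch of the same Inference API request the handler sends; the sample.wav filename and the HF_API_KEY environment variable are assumptions:

import asyncio
import os

import aiohttp

API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"

async def test_transcription() -> None:
    # Assumed: an HF token in HF_API_KEY and a short recording saved as sample.wav
    headers = {"Authorization": f"Bearer {os.environ['HF_API_KEY']}"}
    with open("sample.wav", "rb") as f:
        data = f.read()

    async with aiohttp.ClientSession() as session:
        async with session.post(API_URL, headers=headers, data=data) as response:
            print("Status:", response.status)
            # The handler above expects a JSON body with a "text" field on success
            print("Body:", await response.text())

if __name__ == "__main__":
    asyncio.run(test_transcription())

A 200 status with a JSON body containing "text" would mean the API request itself works.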