From 73e48a57695c8c962a54a8aa716a309f52a11ecb Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Sun, 6 Aug 2023 17:00:54 +0900
Subject: [PATCH] VAD socket

---
Note: the blob ids on the "index" lines below were not regenerated after the
review fixes to RunSocketAudio and Python/audio_server.py in this revision.

 NeMoOnnxSharp/Program.cs       | 46 ++++++++++++++++-
 NeMoOnnxSharp/Settings.cs      |  1 +
 NeMoOnnxSharp/appsettings.json |  3 +-
 Python/audio_server.py         | 94 ++++++++++++++++++++++++++++++++++
 4 files changed, 142 insertions(+), 2 deletions(-)
 create mode 100644 Python/audio_server.py

diff --git a/NeMoOnnxSharp/Program.cs b/NeMoOnnxSharp/Program.cs
index bd7b2dd..c6317f6 100644
--- a/NeMoOnnxSharp/Program.cs
+++ b/NeMoOnnxSharp/Program.cs
@@ -8,10 +8,12 @@
 using System.Text;
 using System.Threading.Tasks;
 using System.Runtime.InteropServices;
+using System.Net.Sockets;
+using System.Collections.Generic;
 
 namespace NeMoOnnxSharp
 {
-    internal class Program
+    internal static class Program
     {
         private static string AppName = "NeMoOnnxSharp";
 
@@ -57,6 +59,12 @@ static async Task Main(string[] args)
             }
             else if (settings.Model == "vad_marblenet")
             {
+                if (settings.Task == "socketaudio")
+                {
+                    RunSocketAudio(modelPath);
+                    return;
+                }
+
                 string inputDirPath = Path.Combine(basePath, "..", "..", "..", "..", "test_data");
                 string inputPath = Path.Combine(inputDirPath, "transcript.txt");
 
@@ -85,6 +93,42 @@ static async Task Main(string[] args)
             }
         }
 
+        private static void RunSocketAudio(string modelPath)
+        {
+            // 16-bit mono PCM at 16 kHz, as produced by Python/audio_server.py.
+            const int SamplesPerSecond = 16000;
+            using var vad = new FrameVAD(modelPath);
+            using Socket socket = new Socket(SocketType.Stream, ProtocolType.Tcp);
+            socket.Connect("127.0.0.1", 17843);
+            Console.WriteLine("Connected");
+            byte[] responseBytes = new byte[1024];
+            var audioSignal = new List<short>();
+            // TCP is a byte stream, so a read may end in the middle of a sample;
+            // the dangling byte is kept at the front of the buffer for the next read.
+            int pendingBytes = 0;
+            while (true)
+            {
+                int bytesReceived = socket.Receive(
+                    responseBytes, pendingBytes, responseBytes.Length - pendingBytes, SocketFlags.None);
+                if (bytesReceived == 0) break;
+                int availableBytes = pendingBytes + bytesReceived;
+                int wholeSampleBytes = availableBytes - (availableBytes % 2);
+                audioSignal.AddRange(
+                    MemoryMarshal.Cast<byte, short>(responseBytes.AsSpan(0, wholeSampleBytes)).ToArray());
+                pendingBytes = availableBytes - wholeSampleBytes;
+                if (pendingBytes != 0)
+                {
+                    responseBytes[0] = responseBytes[wholeSampleBytes];
+                }
+                if (audioSignal.Count > SamplesPerSecond)
+                {
+                    string text = vad.Transcribe(audioSignal.ToArray());
+                    Console.WriteLine("text: {0}", text);
+                    audioSignal.Clear();
+                }
+            }
+        }
+
         private static float[] ReadBinaryBuffer(string path)
         {
             using var stream = File.Open(path, FileMode.Open);
diff --git a/NeMoOnnxSharp/Settings.cs b/NeMoOnnxSharp/Settings.cs
index 4d0273b..0b40771 100644
--- a/NeMoOnnxSharp/Settings.cs
+++ b/NeMoOnnxSharp/Settings.cs
@@ -6,5 +6,6 @@ namespace NeMoOnnxSharp
     internal class Settings
     {
         public string Model { get; set; }
+        public string Task { get; set; }
     }
 }
\ No newline at end of file
diff --git a/NeMoOnnxSharp/appsettings.json b/NeMoOnnxSharp/appsettings.json
index 91fca9d..1208478 100644
--- a/NeMoOnnxSharp/appsettings.json
+++ b/NeMoOnnxSharp/appsettings.json
@@ -1,5 +1,6 @@
 {
   "Settings": {
-    "Model": "vad_marblenet"
+    "Model": "vad_marblenet",
+    "Task": "socketaudio"
   }
 }
\ No newline at end of file
diff --git a/Python/audio_server.py b/Python/audio_server.py
new file mode 100644
index 0000000..be044c7
--- /dev/null
+++ b/Python/audio_server.py
@@ -0,0 +1,94 @@
+import pyaudio as pa
+import socket
+import time
+
+SAMPLE_RATE = 16000
+CHANNELS = 1
+CHUNK_SIZE = 1024
+# NOTE(review): 0.0.0.0 exposes the raw microphone stream on every network
+# interface; use 127.0.0.1 if only local clients should be able to connect.
+HOST = "0.0.0.0"
+PORT = 17843
+# Set to True to print every detected input device at startup.
+LIST_DEVICES = False
+
+
+def find_input_devices(p):
+    """Return (PyAudio device index, device name) for each input-capable device."""
+    input_devices = []
+    for i in range(p.get_device_count()):
+        dev = p.get_device_info_by_index(i)
+        if dev.get('maxInputChannels', 0) >= 1:
+            input_devices.append((i, dev.get('name')))
+    return input_devices
+
+
+def serve_client(p, clientsocket, device_index):
+    """Stream microphone audio to one client until the stream stops."""
+
+    def callback(in_data, frame_count, time_info, status):
+        try:
+            # sendall() retries partial writes so no bytes are dropped.
+            clientsocket.sendall(in_data)
+        except OSError:
+            # The client went away; abort so is_active() turns False.
+            return (in_data, pa.paAbort)
+        return (in_data, pa.paContinue)
+
+    stream = p.open(format=pa.paInt16,
+                    channels=CHANNELS,
+                    rate=SAMPLE_RATE,
+                    input=True,
+                    input_device_index=device_index,
+                    stream_callback=callback,
+                    frames_per_buffer=CHUNK_SIZE)
+    try:
+        stream.start_stream()
+        while stream.is_active():
+            time.sleep(0.1)
+    finally:
+        stream.stop_stream()
+        stream.close()
+
+
+def main():
+    """Serve microphone audio as raw 16-bit PCM over TCP, one client at a time."""
+    p = pa.PyAudio()
+    serversocket = None
+    try:
+        input_devices = find_input_devices(p)
+        if not input_devices:
+            print('No audio input device found.')
+            return
+
+        if LIST_DEVICES:
+            print('Available audio input devices:')
+            for device_index, device_name in input_devices:
+                print(f'{device_index}: {device_name}')
+
+        device_index, device_name = input_devices[0]
+        print(f'Using audio input device: {device_index} {device_name}')
+
+        serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        serversocket.bind((HOST, PORT))
+        serversocket.listen(1)
+        print(f'Listening TCP port {PORT}')
+
+        while True:
+            (clientsocket, address) = serversocket.accept()
+            try:
+                serve_client(p, clientsocket, device_index)
+            finally:
+                clientsocket.close()
+                print("Connection closed")
+    except KeyboardInterrupt:
+        pass
+    finally:
+        if serversocket is not None:
+            serversocket.close()
+        # Terminate PyAudio only on shutdown so later clients can still be served.
+        p.terminate()
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file