diff --git a/NeMoOnnxSharp/AudioFeatureBuffer.cs b/NeMoOnnxSharp/AudioFeatureBuffer.cs
index 97fcb96..a6260f1 100644
--- a/NeMoOnnxSharp/AudioFeatureBuffer.cs
+++ b/NeMoOnnxSharp/AudioFeatureBuffer.cs
@@ -25,6 +25,7 @@ internal class AudioFeatureBuffer
 
         public AudioFeatureBuffer(int stftHopLength = 160, int stftWindowLength = 400, int nMelBands = 64)
         {
+#if false
             _processor = new AudioProcessor(
                 sampleRate: 16000,
                 window: WindowFunction.Hann,
@@ -41,10 +42,30 @@ public AudioFeatureBuffer(int stftHopLength = 160, int stftWindowLength = 400, i
                 melNormalize: MelNormalizeType.None,
                 logOffset: 1e-6,
                 postNormalize: false);
+#else
+            _processor = new AudioProcessor(
+                sampleRate: 16000,
+                window: WindowFunction.Hann,
+                windowLength: 400,
+                hopLength: 160,
+                fftLength: 512,
+                preNormalize: 0.0,
+                preemph: 0.0,
+                center: false,
+                nMelBands: 64,
+                nMFCC: 64,
+                melMinHz: 0.0,
+                melMaxHz: 0.0,
+                htk: true,
+                melNormalize: MelNormalizeType.None,
+                logOffset: 1e-6,
+                postNormalize: false);
+#endif
             _stftHopLength = stftHopLength;
             _stftWindowLength = stftWindowLength;
             _nMelBands = nMelBands;
-            _audioScale = 0.5 / short.MaxValue;
+            // _audioScale = 0.5 / short.MaxValue;
+            _audioScale = 1.0 / short.MaxValue;
 
             _waveformBuffer = new short[2 * _stftHopLength + _stftWindowLength];
             _waveformCount = 0;
diff --git a/NeMoOnnxSharp/FrameVAD.cs b/NeMoOnnxSharp/FrameVAD.cs
index d978193..d840e83 100644
--- a/NeMoOnnxSharp/FrameVAD.cs
+++ b/NeMoOnnxSharp/FrameVAD.cs
@@ -67,12 +67,12 @@ public string Transcribe(short[] waveform)
             for (int j = 0; j + windowLength < waveform.Length; j += stepSize)
             {
                 var waveform2 = waveform.AsSpan(j, windowLength).ToArray();
-                var audioSignal = _processor.MFCC(waveform2);
-                audioSignal = Transpose(audioSignal, _nMelBands);
+                var processedSignal = _processor.MFCC(waveform2);
+                processedSignal = Transpose(processedSignal, _nMelBands);
                 var container = new List<NamedOnnxValue>();
                 var audioSignalData = new DenseTensor<float>(
-                    audioSignal,
-                    new int[3] { 1, _nMelBands, audioSignal.Length / _nMelBands });
+                    processedSignal,
+                    new int[3] { 1, _nMelBands, processedSignal.Length / _nMelBands });
                 container.Add(NamedOnnxValue.CreateFromTensor("audio_signal", audioSignalData));
                 using (var res = _inferSess.Run(container, new string[] { "logits" }))
                 {
@@ -85,6 +85,33 @@ public string Transcribe(short[] waveform)
             return text;
         }
 
+        /// <summary>
+        /// Runs the model on one block of already-extracted features and
+        /// returns the label whose logit is larger.
+        /// </summary>
+        /// <param name="processedSignal">Flat feature buffer; it is transposed
+        /// with <c>_nMelBands</c> columns before inference, so its length is
+        /// assumed to be a multiple of <c>_nMelBands</c> (TODO confirm).</param>
+        /// <returns><c>_labels[0]</c> if the first logit is strictly larger,
+        /// otherwise <c>_labels[1]</c>.</returns>
+        public string TranscribeStep(float[] processedSignal)
+        {
+            processedSignal = Transpose(processedSignal, _nMelBands);
+            var container = new List<NamedOnnxValue>();
+            var audioSignalData = new DenseTensor<float>(
+                processedSignal,
+                new int[3] { 1, _nMelBands, processedSignal.Length / _nMelBands });
+            container.Add(NamedOnnxValue.CreateFromTensor("audio_signal", audioSignalData));
+            string text;
+            using (var res = _inferSess.Run(container, new string[] { "logits" }))
+            {
+                var scoreTensor = res.First();
+                float[] scores = scoreTensor.AsTensor<float>().ToArray();
+                text = (scores[0] > scores[1]) ? _labels[0] : _labels[1];
+            }
+            return text;
+        }
+
         private float[] Transpose(float[] x, int cols)
         {
             var y = new float[x.Length];
diff --git a/NeMoOnnxSharp/Program.cs b/NeMoOnnxSharp/Program.cs
index 4d7c198..2734192 100644
--- a/NeMoOnnxSharp/Program.cs
+++ b/NeMoOnnxSharp/Program.cs
@@ -10,6 +10,7 @@ using System.Runtime.InteropServices;
 
 using System.Net.Sockets;
 using System.Collections.Generic;
+using System.Diagnostics;
 
 namespace NeMoOnnxSharp
 {
@@ -134,6 +135,9 @@ private static void RunFileStreamAudio(string basePath, string modelPath)
             var audioSignal = new List<short>();
 
             int c = 0;
+            var sw = new Stopwatch();
+            sw.Reset();
+            sw.Start();
             while (true)
             {
                 int bytesReceived = stream.Read(responseBytes);
@@ -149,19 +153,21 @@ private static void RunFileStreamAudio(string basePath, string modelPath)
                 {
                     int written = buffer.Write(x, offset, x.Length - offset);
                     offset += written;
-                    while (buffer.OutputCount >= 16000 + 400)
+                    int ws = (int)(16000 * 0.31 / 160 * 64);
+                    int ss = (int)(16000 * 0.01 / 160 * 64);
+                    while (buffer.OutputCount >= ws)
                     {
-                        var y = buffer.OutputCount;
-                        Console.Write(".");
+                        var y = buffer.OutputBuffer.AsSpan(0, ws);
+                        string text = vad.TranscribeStep(y.ToArray());
+                        Console.Write(text == "speech" ? "X" : ".");
                         ++c;
                         if (c % 60 == 0)
                         {
                             c = 0;
                             Console.WriteLine();
                         }
-                        buffer.ConsumeOutput(64);
+                        buffer.ConsumeOutput(ss);
                     }
-
                 }
 
                 if (false)
@@ -175,6 +181,8 @@ private static void RunFileStreamAudio(string basePath, string modelPath)
                     }
                 }
             }
+            sw.Stop();
+            Console.WriteLine("{0}/{1}", sw.ElapsedMilliseconds, stream.Position / 32000.0);
         }
 
         private static MemoryStream GetAllAudioStream(string basePath)
@@ -192,6 +200,7 @@ private static MemoryStream GetAllAudioStream(string basePath)
                 var waveform = WaveFile.ReadWAV(waveFile, 16000);
                 var bytes = MemoryMarshal.Cast<short, byte>(waveform);
                 stream.Write(bytes);
+                stream.Write(new byte[32000]);
             }
             stream.Seek(0, SeekOrigin.Begin);
             return stream;