Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
kaiidams committed Aug 8, 2023
1 parent bd1e0f6 commit 92dd49e
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 10 deletions.
23 changes: 22 additions & 1 deletion NeMoOnnxSharp/AudioFeatureBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ internal class AudioFeatureBuffer

public AudioFeatureBuffer(int stftHopLength = 160, int stftWindowLength = 400, int nMelBands = 64)
{
#if false
_processor = new AudioProcessor(
sampleRate: 16000,
window: WindowFunction.Hann,
Expand All @@ -41,10 +42,30 @@ public AudioFeatureBuffer(int stftHopLength = 160, int stftWindowLength = 400, i
melNormalize: MelNormalizeType.None,
logOffset: 1e-6,
postNormalize: false);
#else
_processor = new AudioProcessor(
sampleRate: 16000,
window: WindowFunction.Hann,
windowLength: 400,
hopLength: 160,
fftLength: 512,
preNormalize: 0.0,
preemph: 0.0,
center: false,
nMelBands: 64,
nMFCC: 64,
melMinHz: 0.0,
melMaxHz: 0.0,
htk: true,
melNormalize: MelNormalizeType.None,
logOffset: 1e-6,
postNormalize: false);
#endif
_stftHopLength = stftHopLength;
_stftWindowLength = stftWindowLength;
_nMelBands = nMelBands;
_audioScale = 0.5 / short.MaxValue;
// _audioScale = 0.5 / short.MaxValue;
_audioScale = 1.0 / short.MaxValue;

_waveformBuffer = new short[2 * _stftHopLength + _stftWindowLength];
_waveformCount = 0;
Expand Down
27 changes: 23 additions & 4 deletions NeMoOnnxSharp/FrameVAD.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,12 @@ public string Transcribe(short[] waveform)
for (int j = 0; j + windowLength < waveform.Length; j += stepSize)
{
var waveform2 = waveform.AsSpan(j, windowLength).ToArray();
var audioSignal = _processor.MFCC(waveform2);
audioSignal = Transpose(audioSignal, _nMelBands);
var processedSignal = _processor.MFCC(waveform2);
processedSignal = Transpose(processedSignal, _nMelBands);
var container = new List<NamedOnnxValue>();
var audioSignalData = new DenseTensor<float>(
audioSignal,
new int[3] { 1, _nMelBands, audioSignal.Length / _nMelBands });
processedSignal,
new int[3] { 1, _nMelBands, processedSignal.Length / _nMelBands });
container.Add(NamedOnnxValue.CreateFromTensor("audio_signal", audioSignalData));
using (var res = _inferSess.Run(container, new string[] { "logits" }))
{
Expand All @@ -85,6 +85,25 @@ public string Transcribe(short[] waveform)
return text;
}

/// <summary>
/// Runs the inference session once on a single window of already-extracted
/// features and returns the label chosen from the first two output logits.
/// </summary>
/// <param name="processedSignal">
/// Flat feature array for one window. It is passed through <c>Transpose</c> with
/// <c>_nMelBands</c> as the column count and then wrapped in a tensor of shape
/// <c>[1, _nMelBands, Length / _nMelBands]</c>.
/// NOTE(review): presumably the length must be a multiple of <c>_nMelBands</c>,
/// since the last dimension uses integer division; confirm against callers.
/// </param>
/// <returns>
/// <c>_labels[0]</c> when the first logit is strictly greater than the second,
/// otherwise <c>_labels[1]</c>.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="processedSignal"/> is <c>null</c>.
/// </exception>
public string TranscribeStep(float[] processedSignal)
{
    if (processedSignal == null)
    {
        throw new ArgumentNullException(nameof(processedSignal));
    }

    processedSignal = Transpose(processedSignal, _nMelBands);
    var audioSignalData = new DenseTensor<float>(
        processedSignal,
        new int[3] { 1, _nMelBands, processedSignal.Length / _nMelBands });
    var container = new List<NamedOnnxValue>
    {
        NamedOnnxValue.CreateFromTensor("audio_signal", audioSignalData)
    };

    // The result collection is disposable; copy the logits out before it is released.
    using (var res = _inferSess.Run(container, new string[] { "logits" }))
    {
        float[] scores = res.First().AsTensor<float>().ToArray();
        return (scores[0] > scores[1]) ? _labels[0] : _labels[1];
    }
}

private float[] Transpose(float[] x, int cols)
{
var y = new float[x.Length];
Expand Down
19 changes: 14 additions & 5 deletions NeMoOnnxSharp/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
using System.Runtime.InteropServices;
using System.Net.Sockets;
using System.Collections.Generic;
using System.Diagnostics;

namespace NeMoOnnxSharp
{
Expand Down Expand Up @@ -134,6 +135,9 @@ private static void RunFileStreamAudio(string basePath, string modelPath)
var audioSignal = new List<short>();
int c = 0;

var sw = new Stopwatch();
sw.Reset();
sw.Start();
while (true)
{
int bytesReceived = stream.Read(responseBytes);
Expand All @@ -149,19 +153,21 @@ private static void RunFileStreamAudio(string basePath, string modelPath)
{
int written = buffer.Write(x, offset, x.Length - offset);
offset += written;
while (buffer.OutputCount >= 16000 + 400)
int ws = (int)(16000 * 0.31 / 160 * 64);
int ss = (int)(16000 * 0.01 / 160 * 64);
while (buffer.OutputCount >= ws)
{
var y = buffer.OutputCount;
Console.Write(".");
var y = buffer.OutputBuffer.AsSpan(0, ws);
string text = vad.TranscribeStep(y.ToArray());
Console.Write(text == "speech" ? "X" : ".");
++c;
if (c % 60 == 0)
{
c = 0;
Console.WriteLine();
}
buffer.ConsumeOutput(64);
buffer.ConsumeOutput(ss);
}

}

if (false)
Expand All @@ -175,6 +181,8 @@ private static void RunFileStreamAudio(string basePath, string modelPath)
}
}
}
sw.Stop();
Console.WriteLine("{0}/{1}", sw.ElapsedMilliseconds, stream.Position / 32000.0);
}

private static MemoryStream GetAllAudioStream(string basePath)
Expand All @@ -192,6 +200,7 @@ private static MemoryStream GetAllAudioStream(string basePath)
var waveform = WaveFile.ReadWAV(waveFile, 16000);
var bytes = MemoryMarshal.Cast<short, byte>(waveform);
stream.Write(bytes);
stream.Write(new byte[32000]);
}
stream.Seek(0, SeekOrigin.Begin);
return stream;
Expand Down

0 comments on commit 92dd49e

Please sign in to comment.