Skip to content

Commit

Permalink
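Remove AudioProcessor and MFCCAudioProcessor; rename the processors to *Preprocessor, IAudioProcessor to IAudioPreprocessor, and Process to GetFeatures.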
Browse files Browse the repository at this point in the history
  • Loading branch information
kaiidams committed Aug 12, 2023
1 parent accd5bd commit c2d6491
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 62 deletions.
2 changes: 1 addition & 1 deletion NeMoOnnxSharp.Program/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ static async Task Main(string[] args)
string inputDirPath = Path.Combine(basePath, "..", "..", "..", "..", "test_data");
string inputPath = Path.Combine(inputDirPath, "transcript.txt");

var processor = new MFCCAudioProcessor(
var processor = new AudioToMFCCPreprocessor(
sampleRate: 16000,
window: WindowFunction.Hann,
windowLength: 400,
Expand Down
12 changes: 6 additions & 6 deletions NeMoOnnxSharp.Tests/AudioFeatureBufferTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,14 @@ private static double MSE(double[] a, double[] b)
}

short[] waveform;
AudioProcessor processor;
AudioToMFCCPreprocessor processor;

public PreprocessorTest()
{
string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
string waveFile = Path.Combine(appDirPath, "Data", SampleWAVSpeechFile);
waveform = WaveFile.ReadWAV(waveFile, SampleRate);
processor = new MFCCAudioProcessor(
processor = new AudioToMFCCPreprocessor(
sampleRate: SampleRate,
window: WindowFunction.Hann,
windowLength: 400,
Expand Down Expand Up @@ -90,7 +90,7 @@ public void TestMelSpectrogram()
[TestMethod]
public void TestMFCC()
{
var processor = new MFCCAudioProcessor(
var processor = new AudioToMFCCPreprocessor(
sampleRate: SampleRate,
window: WindowFunction.Hann,
windowLength: 400,
Expand All @@ -115,14 +115,14 @@ public void TestReadFrame()
{
int windowLength = 5;
int fftLength = 9;
var processor = new MFCCAudioProcessor(
var processor = new AudioToMFCCPreprocessor(
windowLength: windowLength,
fftLength: fftLength,
preemph: 0.0);

MethodInfo methodInfo1 = typeof(AudioProcessor).GetMethod(
MethodInfo methodInfo1 = typeof(AudioToMFCCPreprocessor).GetMethod(
"ReadFrameCenter", BindingFlags.NonPublic | BindingFlags.Instance);
MethodInfo methodInfo2 = typeof(AudioProcessor).GetMethod(
MethodInfo methodInfo2 = typeof(AudioToMFCCPreprocessor).GetMethod(
"ReadFrameCenterPreemphasis", BindingFlags.NonPublic | BindingFlags.Instance);

var rng = new Random();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

namespace NeMoOnnxSharp
{
public abstract class AudioProcessor : IAudioProcessor<short, float>
public class AudioToMFCCPreprocessor : IAudioPreprocessor<short, float>
{
private enum FrameType
{
Expand Down Expand Up @@ -46,7 +46,7 @@ private static FrameType GetFrameType(bool center, double preemph)
private readonly double _postNormalizeOffset;
private readonly int _nMFCC;

public AudioProcessor(
public AudioToMFCCPreprocessor(
int sampleRate = 16000,
WindowFunction window = WindowFunction.Hann,
int windowLength = 0,
Expand Down Expand Up @@ -94,7 +94,10 @@ public AudioProcessor(
_postNormalizeOffset = postNormalizeOffset;
}

public abstract float[] Process(Span<short> input);
/// <summary>
/// Implements <c>IAudioPreprocessor&lt;short, float&gt;.GetFeatures</c> by copying
/// the span into a new array and passing that array to <c>MFCC</c>.
/// </summary>
/// <param name="input">Input samples; copied with <c>ToArray()</c> before processing.</param>
/// <returns>The array returned by <c>MFCC</c> for the copied samples.</returns>
public float[] GetFeatures(Span<short> input)
{
// MFCC is called with an array, so the span is materialized first (one allocation per call).
return MFCC(input.ToArray());
}

public float[] MelSpectrogram(short[] waveform)
{
Expand Down
11 changes: 8 additions & 3 deletions NeMoOnnxSharp/AudioToMelSpectrogramPreprocessor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

namespace NeMoOnnxSharp
{
public class AudioToMelSpectrogramProcessor
public class AudioToMelSpectrogramPreprocessor : IAudioPreprocessor<short, float>
{
private enum FrameType
{
Expand Down Expand Up @@ -46,7 +46,7 @@ private static FrameType GetFrameType(bool center, double preemph)
private readonly double _postNormalizeOffset;
private readonly int _nMFCC;

public AudioToMelSpectrogramProcessor(
public AudioToMelSpectrogramPreprocessor(
int sampleRate = 16000,
WindowFunction window = WindowFunction.Hann,
int windowLength = 0,
Expand Down Expand Up @@ -94,7 +94,12 @@ public AudioToMelSpectrogramProcessor(
_postNormalizeOffset = postNormalizeOffset;
}

public virtual float[] Process(short[] waveform)
/// <summary>
/// Implements <c>IAudioPreprocessor&lt;short, float&gt;.GetFeatures</c> by copying
/// the span into a new array and delegating to the <c>short[]</c> overload.
/// </summary>
/// <param name="waveform">Input samples; copied with <c>ToArray()</c> before processing.</param>
/// <returns>The array returned by the <c>short[]</c> overload of <c>GetFeatures</c>.</returns>
public float[] GetFeatures(Span<short> waveform)
{
// The argument is a short[], so this binds to the array overload rather than recursing into this method.
return GetFeatures(waveform.ToArray());
}

public float[] GetFeatures(short[] waveform)
{
double scale = GetScaleFactor(waveform);
int outputStep = _nMelBands;
Expand Down
6 changes: 3 additions & 3 deletions NeMoOnnxSharp/FrameVAD.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ namespace NeMoOnnxSharp
{
public class FrameVAD : IDisposable
{
private readonly AudioProcessor _processor;
private readonly IAudioPreprocessor<short, float> _processor;
private readonly InferenceSession _inferSess;
private readonly int _nMelBands;
private readonly string[] _labels;

private FrameVAD()
{
_nMelBands = 64;
_processor = new MFCCAudioProcessor(
_processor = new AudioToMFCCPreprocessor(
sampleRate: 16000,
window: WindowFunction.Hann,
windowLength: 400,
Expand Down Expand Up @@ -67,7 +67,7 @@ public string Transcribe(short[] waveform)
for (int j = 0; j + windowLength < waveform.Length; j += stepSize)
{
var waveform2 = waveform.AsSpan(j, windowLength).ToArray();
var processedSignal = _processor.MFCC(waveform2);
var processedSignal = _processor.GetFeatures(waveform2);
processedSignal = Transpose(processedSignal, _nMelBands);
var container = new List<NamedOnnxValue>();
var audioSignalData = new DenseTensor<float>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

namespace NeMoOnnxSharp
{
public interface IAudioProcessor<T1, T2>
public interface IAudioPreprocessor<T1, T2>
{
T2[] Process(Span<T1> input);
T2[] GetFeatures(Span<T1> input);
}
}
41 changes: 0 additions & 41 deletions NeMoOnnxSharp/MFCCAudioProcessor.cs

This file was deleted.

6 changes: 3 additions & 3 deletions NeMoOnnxSharp/SpeechRecognizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ public class SpeechRecognizer : ISpeechRecognizer
{
private const string Vocabulary = " abcdefghijklmnopqrstuvwxyz'_";

private readonly AudioToMelSpectrogramProcessor _processor;
private readonly IAudioPreprocessor<short, float> _processor;
private readonly CharTokenizer _tokenizer;
private readonly InferenceSession _inferSess;
private readonly int _nMelBands;

private SpeechRecognizer()
{
_nMelBands = 64;
_processor = new AudioToMelSpectrogramProcessor(
_processor = new AudioToMelSpectrogramPreprocessor(
sampleRate: 16000,
window: WindowFunction.Hann,
windowLength: 400,
Expand Down Expand Up @@ -62,7 +62,7 @@ public void Dispose()
public string Recognize(short[] waveform)
{
string text = string.Empty;
var audioSignal = _processor.Process(waveform);
var audioSignal = _processor.GetFeatures(waveform);
audioSignal = Transpose(audioSignal, _nMelBands);
var container = new List<NamedOnnxValue>();
var audioSignalData = new DenseTensor<float>(
Expand Down

0 comments on commit c2d6491

Please sign in to comment.