Skip to content

Commit

Permalink
Add English G2P and tokenizer. (#15)
Browse files Browse the repository at this point in the history
  • Loading branch information
kaiidams authored Sep 30, 2023
1 parent 28b439b commit 4ad71cb
Show file tree
Hide file tree
Showing 10 changed files with 808 additions and 0 deletions.
10 changes: 10 additions & 0 deletions NeMoOnnxSharp.Example/PretrainedModelInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@ private static PretrainedModelInfo[] CreateModelList()
"https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/commandrecognition_en_matchboxnet3x1x64_v2.onnx",
"a0c5e4d14e83d3b6afdaf239265a390c2ca513bcdedf3d295bc1f9f97f19868a"
),
new PretrainedModelInfo(
"cmudict-0.7b_nv22.10",
"https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/cmudict-0.7b_nv22.10",
"d330f3a3554d4c7ff8ef7bfc0c338ed74831d5f54109508fb829bdd82173608b"
),
new PretrainedModelInfo(
"heteronyms-052722",
"https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/heteronyms-052722",
"b701909aedf753172eff223950f8859cd4b9b4c80199cf0a6e9ac4a307c8f8ec"
),
new PretrainedModelInfo(
"tts_en_hifigan",
"https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/tts_en_hifigan.onnx",
Expand Down
22 changes: 22 additions & 0 deletions NeMoOnnxSharp.Example/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Text;
using System.Threading.Tasks;
using System.Collections.Generic;
using NeMoOnnxSharp.TTSTokenizers;

namespace NeMoOnnxSharp.Example
{
Expand All @@ -19,6 +20,10 @@ static async Task Main(string[] args)
{
await Transcribe();
}
else if (task == "speak")
{
await Speak();
}
else if (task == "vad")
{
await FramePredict(false);
Expand Down Expand Up @@ -54,6 +59,23 @@ static async Task Transcribe()
}
}

/// <summary>
/// Example task demonstrating English G2P and phoneme tokenization:
/// downloads the CMU pronunciation dictionary and the heteronym list,
/// builds the tokenizer, and encodes a short sample text.
/// </summary>
static async Task Speak()
{
    // NOTE: the downloaded files are identified by the names registered
    // in PretrainedModelInfo; DownloadModelAsync returns local paths.
    string phoneDict = await DownloadModelAsync("cmudict-0.7b_nv22.10");
    string heteronyms = await DownloadModelAsync("heteronyms-052722");
    var g2p = new EnglishG2p(phoneDict, heteronyms);
    var tokenizer = new EnglishPhonemesTokenizer(
        g2p,
        punct: true,
        stresses: true,
        chars: true,
        apostrophe: true,
        padWithSpace: true,
        addBlankAt: BaseTokenizer.AddBlankAt.True);
    // The example only exercises the pipeline; the token ids are discarded.
    tokenizer.Encode("Hello world!");
}

static async Task FramePredict(bool mbn)
{
string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
Expand Down
21 changes: 21 additions & 0 deletions NeMoOnnxSharp.Tests/Data/cmudict-test
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Comment
YOU'VE Y UW1 V
READ R EH1 D
READ(1) R IY1 D
BOOK B UH1 K
THE DH AH0
THE(1) DH AH1
THE(2) DH IY0
OPERATING AA1 P ER0 EY2 T IH0 NG
OPERATING(1) AO1 P ER0 EY2 T IH0 NG
SYSTEM S IH1 S T AH0 M
DESIGN D IH0 Z AY1 N
AND AH0 N D
AND(1) AE1 N D
IMPLEMENTATION IH2 M P L AH0 M EH0 N T EY1 SH AH0 N
THIRD TH ER1 D
EDITION AH0 D IH1 SH AH0 N
EDITION(1) IH0 D IH1 SH AH0 N
DID D IH1 D
DID(1) D IH0 D
YOU Y UW1
1 change: 1 addition & 0 deletions NeMoOnnxSharp.Tests/Data/heteronyms-test
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
read
6 changes: 6 additions & 0 deletions NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@
</ItemGroup>

<ItemGroup>
<None Update="Data\cmudict-test">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Data\heteronyms-test">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Data\make_test.py">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</None>
Expand Down
89 changes: 89 additions & 0 deletions NeMoOnnxSharp.Tests/TextTokenizersTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
using Microsoft.VisualStudio.TestTools.UnitTesting;
using NeMoOnnxSharp.TTSTokenizers;
using System;
using System.Diagnostics;
using System.IO;

namespace NeMoOnnxSharp.Tests
{
    /// <summary>
    /// Tests for <see cref="EnglishG2p"/> and <see cref="EnglishPhonemesTokenizer"/>
    /// using the small dictionary fixtures under Data\.
    /// </summary>
    [TestClass]
    public class TextTokenizersTest
    {
        // Expected vocabulary: space, consonant phonemes, stressed vowel
        // phonemes, lowercase characters, punctuation, and special tokens.
        private readonly static string[] ExpectedTokens =
        {
            " ", "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M",
            "N", "NG", "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH",
            "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0",
            "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "EH0", "EH1",
            "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "IH0", "IH1", "IH2",
            "IY0", "IY1", "IY2", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "UH0",
            "UH1", "UH2", "UW0", "UW1", "UW2", "a", "b", "c", "d", "e", "f", "g",
            "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
            "v", "w", "x", "y", "z", "'", ",", ".", "!", "?", "-", ":", ";", "/",
            "\"", "(", ")", "[", "]", "{", "}", "<pad>", "<blank>", "<oov>"
        };

        private const string SampleText =
            "You've read the book “Operating Systems Design and Implementation, 3rd edition”. Did you?";
        private const string NormalizedSampleText =
            "You've read the book “Operating Systems Design and Implementation, third edition”. Did you?";
        // Expected pronunciation: dictionary words become ARPABET phonemes,
        // out-of-dictionary words (and the heteronym "read") stay as characters.
        private const string SamplePronText =
            "Y|UW1|V| |r|e|a|d| |t|h|e| |B|UH1|K| |“|o|p|e|r|a|t|i|n|g| |"
            + "S|IH1|S|T|AH0|M|Z| |D|IH0|Z|AY1|N| |a|n|d| |IH2|M|P|L|AH0|"
            + "M|EH0|N|T|EY1|SH|AH0|N|,| |TH|ER1|D| |e|d|i|t|i|o|n|”|.| |"
            + "d|i|d| |Y|UW1|?";

        private readonly static int[] SampleParsed =
        {
            0, 22, 68, 20, 0, 87, 74, 70, 73, 0, 89, 77, 74,
            0, 1, 65, 9, 0, 105, 84, 85, 74, 87, 70, 89, 78,
            83, 76, 0, 16, 53, 16, 18, 31, 11, 23, 0, 3, 52,
            23, 41, 12, 0, 70, 83, 73, 0, 54, 11, 14, 10, 31,
            11, 43, 12, 18, 50, 17, 31, 12, 97, 0, 19, 47, 3,
            0, 74, 73, 78, 89, 78, 84, 83, 105, 98, 0, 73, 78,
            73, 0, 22, 68, 100, 0
        };

        /// <summary>
        /// Builds the G2P converter and tokenizer from the test fixture
        /// dictionaries. Runs before every test method, so the fields below
        /// are always non-null inside the tests.
        /// </summary>
        [TestInitialize]
        public void Initialize()
        {
            string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
            _g2p = new EnglishG2p(
                phonemeDict: Path.Combine(appDirPath, "Data", "cmudict-test"),
                heteronyms: Path.Combine(appDirPath, "Data", "heteronyms-test"),
                phonemeProbability: 1.0);
            _tokenizer = new EnglishPhonemesTokenizer(
                _g2p,
                punct: true,
                stresses: true,
                chars: true,
                apostrophe: true,
                padWithSpace: true,
                addBlankAt: BaseTokenizer.AddBlankAt.True);
        }

        [TestMethod]
        public void TestTokenizerVocab()
        {
            // Fields are assigned in Initialize; '!' silences CS8602
            // (dereference of a possibly null reference) reported by CI.
            CollectionAssert.AreEquivalent(ExpectedTokens, _tokenizer!.Tokens);
        }

        [TestMethod]
        public void TestEnglishG2p()
        {
            var pron = string.Join("|", _g2p!.Parse(NormalizedSampleText));
            Assert.AreEqual(SamplePronText, pron);
        }

        [TestMethod]
        public void TestEnglishEncode()
        {
            var parsed = _tokenizer!.Encode(NormalizedSampleText);
            CollectionAssert.AreEquivalent(SampleParsed, parsed);
        }

        // Nullable because construction happens in Initialize, not a constructor.
        private EnglishG2p? _g2p;
        private EnglishPhonemesTokenizer? _tokenizer;
    }
}
75 changes: 75 additions & 0 deletions NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
// largely located in the files found in this folder:
//
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace NeMoOnnxSharp.TTSTokenizers
{
    /// <summary>
    /// Common base class for TTS tokenizers. Holds the token/id vocabulary
    /// and the ids of utility tokens (pad, blank, oov) that are stripped
    /// when decoding. Subclasses implement <see cref="Encode"/>.
    /// </summary>
    public abstract class BaseTokenizer
    {
        /// <summary>Where (if anywhere) to insert the blank token.</summary>
        public enum AddBlankAt
        {
            False,
            True,
            Last
        }

        protected const string Pad = "<pad>";
        protected const string Blank = "<blank>";
        protected const string OOV = "<oov>";

        /// <summary>
        /// Starts with an empty vocabulary; subclasses populate the
        /// protected fields during their own construction.
        /// </summary>
        protected BaseTokenizer()
        {
            _sep = string.Empty;
            _id2token = Array.Empty<string>();
            _token2id = new Dictionary<string, int>();
            _utilIds = new HashSet<int>();
        }

        /// <summary>
        /// Turns str text into int tokens.
        /// </summary>
        public abstract int[] Encode(string text);

        /// <summary>
        /// Turns ints tokens into str text. Utility tokens (pad, blank,
        /// oov) are dropped; the remaining tokens are joined with
        /// <see cref="Sep"/>.
        /// </summary>
        public string Decode(int[] tokens)
        {
            var pieces = new List<string>();
            foreach (var id in tokens)
            {
                if (_utilIds.Contains(id))
                {
                    continue;
                }
                pieces.Add(_id2token[id]);
            }
            return string.Join(_sep, pieces);
        }

        public string[] Tokens => _id2token;
        public int PadId => _pad;
        public int BlankId => _blank;
        public int OOVId => _oov;
        public string Sep => _sep;

        protected string[] _id2token;
        protected IDictionary<string, int> _token2id;
        protected ISet<int> _utilIds;
        protected int _space;
        protected int _pad;
        protected int _blank;
        protected int _oov;
        protected string _sep;
        protected bool _padWithSpace;
    }
}
Loading

0 comments on commit 4ad71cb

Please sign in to comment.