-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add English G2P and tokenizer. (#15)
- Loading branch information
Showing
10 changed files
with
808 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Comment | ||
YOU'VE Y UW1 V | ||
READ R EH1 D | ||
READ(1) R IY1 D | ||
BOOK B UH1 K | ||
THE DH AH0 | ||
THE(1) DH AH1 | ||
THE(2) DH IY0 | ||
OPERATING AA1 P ER0 EY2 T IH0 NG | ||
OPERATING(1) AO1 P ER0 EY2 T IH0 NG | ||
SYSTEM S IH1 S T AH0 M | ||
DESIGN D IH0 Z AY1 N | ||
AND AH0 N D | ||
AND(1) AE1 N D | ||
IMPLEMENTATION IH2 M P L AH0 M EH0 N T EY1 SH AH0 N | ||
THIRD TH ER1 D | ||
EDITION AH0 D IH1 SH AH0 N | ||
EDITION(1) IH0 D IH1 SH AH0 N | ||
DID D IH1 D | ||
DID(1) D IH0 D | ||
YOU Y UW1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
read |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
using Microsoft.VisualStudio.TestTools.UnitTesting; | ||
using NeMoOnnxSharp.TTSTokenizers; | ||
using System; | ||
using System.Diagnostics; | ||
using System.IO; | ||
|
||
namespace NeMoOnnxSharp.Tests | ||
{ | ||
/// <summary>
/// Tests for <see cref="EnglishG2p"/> and <see cref="EnglishPhonemesTokenizer"/>
/// using the small dictionary and heteronym fixtures under the "Data" folder.
/// </summary>
[TestClass]
public class TextTokenizersTest
{
    // Vocabulary expected from the tokenizer as configured in Initialize().
    private static readonly string[] ExpectedTokens =
    {
        " ", "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M",
        "N", "NG", "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH",
        "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0",
        "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "EH0", "EH1",
        "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "IH0", "IH1", "IH2",
        "IY0", "IY1", "IY2", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "UH0",
        "UH1", "UH2", "UW0", "UW1", "UW2", "a", "b", "c", "d", "e", "f", "g",
        "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
        "v", "w", "x", "y", "z", "'", ",", ".", "!", "?", "-", ":", ";", "/",
        "\"", "(", ")", "[", "]", "{", "}", "<pad>", "<blank>", "<oov>"
    };

    // Raw sample sentence (contains the unnormalized "3rd").
    private const string SampleText =
        "You've read the book “Operating Systems Design and Implementation, 3rd edition”. Did you?";

    // Same sentence with "3rd" spelled out; this is the input the tests feed in.
    private const string NormalizedSampleText =
        "You've read the book “Operating Systems Design and Implementation, third edition”. Did you?";

    // Expected G2P output for NormalizedSampleText, items joined with '|'.
    private const string SamplePronText =
        "Y|UW1|V| |r|e|a|d| |t|h|e| |B|UH1|K| |“|o|p|e|r|a|t|i|n|g| |"
        + "S|IH1|S|T|AH0|M|Z| |D|IH0|Z|AY1|N| |a|n|d| |IH2|M|P|L|AH0|"
        + "M|EH0|N|T|EY1|SH|AH0|N|,| |TH|ER1|D| |e|d|i|t|i|o|n|”|.| |"
        + "d|i|d| |Y|UW1|?";

    // Expected Encode() output for NormalizedSampleText. The ids line up with
    // positions in ExpectedTokens (e.g. 0 = " ", 22 = "Y", 68 = "UW1").
    private static readonly int[] SampleParsed =
    {
        0, 22, 68, 20, 0, 87, 74, 70, 73, 0, 89, 77, 74,
        0, 1, 65, 9, 0, 105, 84, 85, 74, 87, 70, 89, 78,
        83, 76, 0, 16, 53, 16, 18, 31, 11, 23, 0, 3, 52,
        23, 41, 12, 0, 70, 83, 73, 0, 54, 11, 14, 10, 31,
        11, 43, 12, 18, 50, 17, 31, 12, 97, 0, 19, 47, 3,
        0, 74, 73, 78, 89, 78, 84, 83, 105, 98, 0, 73, 78,
        73, 0, 22, 68, 100, 0
    };

    // Both fields are assigned in Initialize(), which MSTest runs before every
    // test method. They are declared non-nullable (with the null-forgiving
    // initializer) so the tests can dereference them without nullable warnings.
    private EnglishG2p _g2p = null!;
    private EnglishPhonemesTokenizer _tokenizer = null!;

    /// <summary>
    /// Builds the G2P converter and the tokenizer from the test fixtures
    /// located next to the test binaries.
    /// </summary>
    [TestInitialize]
    public void Initialize()
    {
        string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
        _g2p = new EnglishG2p(
            phonemeDict: Path.Combine(appDirPath, "Data", "cmudict-test"),
            heteronyms: Path.Combine(appDirPath, "Data", "heteronyms-test"),
            phonemeProbability: 1.0);
        _tokenizer = new EnglishPhonemesTokenizer(
            _g2p,
            punct: true,
            stresses: true,
            chars: true,
            apostrophe: true,
            padWithSpace: true,
            addBlankAt: BaseTokenizer.AddBlankAt.True);
    }

    /// <summary>
    /// The tokenizer exposes exactly the expected set of tokens.
    /// </summary>
    [TestMethod]
    public void TestTokenizerVocab()
    {
        CollectionAssert.AreEquivalent(ExpectedTokens, _tokenizer.Tokens);
    }

    /// <summary>
    /// G2P output for the normalized sample matches the expected pronunciation.
    /// </summary>
    [TestMethod]
    public void TestEnglishG2p()
    {
        var pron = string.Join("|", _g2p.Parse(NormalizedSampleText));

        Assert.AreEqual(SamplePronText, pron);
    }

    /// <summary>
    /// Encoding the normalized sample yields the expected id sequence.
    /// </summary>
    [TestMethod]
    public void TestEnglishEncode()
    {
        var parsed = _tokenizer.Encode(NormalizedSampleText);

        // The encoded ids are a sequence, so order is part of the expectation.
        // AreEquivalent would accept any permutation of the same ids.
        CollectionAssert.AreEqual(SampleParsed, parsed);
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
// Copyright (c) Katsuya Iida. All Rights Reserved. | ||
// See LICENSE in the project root for license information. | ||
|
||
// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo, | ||
// largely located in the files found in this folder: | ||
// | ||
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py | ||
// | ||
// The origin has the following copyright notice and license: | ||
// | ||
// https://github.com/NVIDIA/NeMo/blob/main/LICENSE | ||
// | ||
|
||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
|
||
namespace NeMoOnnxSharp.TTSTokenizers | ||
{ | ||
/// <summary>
/// Base class for text tokenizers. Holds the id/token lookup tables and the
/// ids of the special tokens, and provides decoding from ids back to text.
/// Subclasses implement <see cref="Encode"/> and populate the protected fields.
/// </summary>
public abstract class BaseTokenizer
{
    /// <summary>
    /// Blank-token option accepted by concrete tokenizers. How each value is
    /// interpreted is decided by the subclass and is not visible in this class.
    /// </summary>
    public enum AddBlankAt
    {
        False,
        True,
        Last
    }

    /// <summary>Text of the padding token.</summary>
    protected const string Pad = "<pad>";

    /// <summary>Text of the blank token.</summary>
    protected const string Blank = "<blank>";

    /// <summary>Text of the out-of-vocabulary token.</summary>
    protected const string OOV = "<oov>";

    /// <summary>
    /// Initializes the lookup tables to empty collections and the separator to
    /// the empty string. The integer id fields keep their default value (0)
    /// until a subclass assigns them.
    /// </summary>
    protected BaseTokenizer()
    {
        _sep = string.Empty;
        _id2token = Array.Empty<string>();
        _token2id = new Dictionary<string, int>();
        _utilIds = new HashSet<int>();
    }

    /// <summary>
    /// Converts text into a sequence of token ids.
    /// </summary>
    /// <param name="text">The text to encode.</param>
    /// <returns>The token ids produced for <paramref name="text"/>.</returns>
    public abstract int[] Encode(string text);

    /// <summary>
    /// Converts a sequence of token ids back into text.
    /// </summary>
    /// <param name="tokens">The token ids to decode.</param>
    /// <returns>
    /// The tokens for every id that is not in the utility-id set, joined with
    /// <see cref="Sep"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="tokens"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="IndexOutOfRangeException">
    /// A non-utility id falls outside the token table.
    /// </exception>
    public string Decode(int[] tokens)
    {
        // Fail with the caller's parameter name rather than LINQ's "source".
        if (tokens == null)
        {
            throw new ArgumentNullException(nameof(tokens));
        }

        var pieces = tokens
            .Where(id => !_utilIds.Contains(id))
            .Select(id => _id2token[id]);
        return string.Join(_sep, pieces);
    }

    /// <summary>
    /// Token table indexed by id. This is the internal array itself, not a copy.
    /// </summary>
    public string[] Tokens => _id2token;

    /// <summary>Id of the padding token.</summary>
    public int PadId => _pad;

    /// <summary>Id of the blank token.</summary>
    public int BlankId => _blank;

    /// <summary>Id of the out-of-vocabulary token.</summary>
    public int OOVId => _oov;

    /// <summary>Separator placed between tokens by <see cref="Decode"/>.</summary>
    public string Sep => _sep;

    // State populated by subclasses.
    protected string[] _id2token;                 // id -> token text
    protected IDictionary<string, int> _token2id; // token text -> id
    protected ISet<int> _utilIds;                 // ids skipped by Decode
    protected int _space;
    protected int _pad;
    protected int _blank;
    protected int _oov;
    protected string _sep;
    protected bool _padWithSpace;
}
} |
Oops, something went wrong.