From 4ad71cb525dfd6704a53f3dd98e76a6c5f6cd8b3 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 30 Sep 2023 14:04:15 +0900 Subject: [PATCH] Add English G2P and tokenizer. (#15) --- NeMoOnnxSharp.Example/PretrainedModelInfo.cs | 10 + NeMoOnnxSharp.Example/Program.cs | 22 ++ NeMoOnnxSharp.Tests/Data/cmudict-test | 21 ++ NeMoOnnxSharp.Tests/Data/heteronyms-test | 1 + .../NeMoOnnxSharp.Tests.csproj | 6 + NeMoOnnxSharp.Tests/TextTokenizersTest.cs | 89 +++++++ NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs | 75 ++++++ NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs | 213 +++++++++++++++++ .../TTSTokenizers/EnglishPhonemesTokenizer.cs | 224 ++++++++++++++++++ NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs | 147 ++++++++++++ 10 files changed, 808 insertions(+) create mode 100644 NeMoOnnxSharp.Tests/Data/cmudict-test create mode 100644 NeMoOnnxSharp.Tests/Data/heteronyms-test create mode 100644 NeMoOnnxSharp.Tests/TextTokenizersTest.cs create mode 100644 NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs create mode 100644 NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs create mode 100644 NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs create mode 100644 NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs diff --git a/NeMoOnnxSharp.Example/PretrainedModelInfo.cs b/NeMoOnnxSharp.Example/PretrainedModelInfo.cs index 999722f..ed2b6ad 100644 --- a/NeMoOnnxSharp.Example/PretrainedModelInfo.cs +++ b/NeMoOnnxSharp.Example/PretrainedModelInfo.cs @@ -45,6 +45,16 @@ private static PretrainedModelInfo[] CreateModelList() "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/commandrecognition_en_matchboxnet3x1x64_v2.onnx", "a0c5e4d14e83d3b6afdaf239265a390c2ca513bcdedf3d295bc1f9f97f19868a" ), + new PretrainedModelInfo( + "cmudict-0.7b_nv22.10", + "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/cmudict-0.7b_nv22.10", + "d330f3a3554d4c7ff8ef7bfc0c338ed74831d5f54109508fb829bdd82173608b" + ), + new PretrainedModelInfo( + "heteronyms-052722", + 
"https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/heteronyms-052722", + "b701909aedf753172eff223950f8859cd4b9b4c80199cf0a6e9ac4a307c8f8ec" + ), new PretrainedModelInfo( "tts_en_hifigan", "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/tts_en_hifigan.onnx", diff --git a/NeMoOnnxSharp.Example/Program.cs b/NeMoOnnxSharp.Example/Program.cs index 1ccf991..0bf9cb2 100644 --- a/NeMoOnnxSharp.Example/Program.cs +++ b/NeMoOnnxSharp.Example/Program.cs @@ -6,6 +6,7 @@ using System.Text; using System.Threading.Tasks; using System.Collections.Generic; +using NeMoOnnxSharp.TTSTokenizers; namespace NeMoOnnxSharp.Example { @@ -19,6 +20,10 @@ static async Task Main(string[] args) { await Transcribe(); } + else if (task == "speak") + { + await Speak(); + } else if (task == "vad") { await FramePredict(false); @@ -54,6 +59,23 @@ static async Task Transcribe() } } + static async Task Speak() + { + string appDirPath = AppDomain.CurrentDomain.BaseDirectory; + string phoneDict = await DownloadModelAsync("cmudict-0.7b_nv22.10"); + string heteronyms = await DownloadModelAsync("heteronyms-052722"); + var g2p = new EnglishG2p(phoneDict, heteronyms); + var tokenizer = new EnglishPhonemesTokenizer( + g2p, + punct: true, + stresses: true, + chars: true, + apostrophe: true, + padWithSpace: true, + addBlankAt: BaseTokenizer.AddBlankAt.True); + tokenizer.Encode("Hello world!"); + } + static async Task FramePredict(bool mbn) { string appDirPath = AppDomain.CurrentDomain.BaseDirectory; diff --git a/NeMoOnnxSharp.Tests/Data/cmudict-test b/NeMoOnnxSharp.Tests/Data/cmudict-test new file mode 100644 index 0000000..85f77af --- /dev/null +++ b/NeMoOnnxSharp.Tests/Data/cmudict-test @@ -0,0 +1,21 @@ +# Comment +YOU'VE Y UW1 V +READ R EH1 D +READ(1) R IY1 D +BOOK B UH1 K +THE DH AH0 +THE(1) DH AH1 +THE(2) DH IY0 +OPERATING AA1 P ER0 EY2 T IH0 NG +OPERATING(1) AO1 P ER0 EY2 T IH0 NG +SYSTEM S IH1 S T AH0 M +DESIGN D IH0 Z AY1 N +AND AH0 N D +AND(1) AE1 N D +IMPLEMENTATION IH2 
M P L AH0 M EH0 N T EY1 SH AH0 N +THIRD TH ER1 D +EDITION AH0 D IH1 SH AH0 N +EDITION(1) IH0 D IH1 SH AH0 N +DID D IH1 D +DID(1) D IH0 D +YOU Y UW1 diff --git a/NeMoOnnxSharp.Tests/Data/heteronyms-test b/NeMoOnnxSharp.Tests/Data/heteronyms-test new file mode 100644 index 0000000..70705e7 --- /dev/null +++ b/NeMoOnnxSharp.Tests/Data/heteronyms-test @@ -0,0 +1 @@ +read \ No newline at end of file diff --git a/NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj b/NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj index c030242..1b2c870 100644 --- a/NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj +++ b/NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj @@ -45,6 +45,12 @@ + + PreserveNewest + + + PreserveNewest + + Never
diff --git a/NeMoOnnxSharp.Tests/TextTokenizersTest.cs b/NeMoOnnxSharp.Tests/TextTokenizersTest.cs
new file mode 100644
index 0000000..1ee8503
--- /dev/null
+++ b/NeMoOnnxSharp.Tests/TextTokenizersTest.cs
@@ -0,0 +1,89 @@
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+using NeMoOnnxSharp.TTSTokenizers;
+using System;
+using System.Diagnostics;
+using System.IO;
+
+namespace NeMoOnnxSharp.Tests
+{
+    /// <summary>
+    /// Tests for <see cref="EnglishG2p"/> and <see cref="EnglishPhonemesTokenizer"/> using the small
+    /// dictionary and heteronym fixtures under <c>Data/</c>.
+    /// </summary>
+    [TestClass]
+    public class TextTokenizersTest
+    {
+        // Expected vocabulary in id order: space, consonants, stressed vowels,
+        // lowercase letters, apostrophe, punctuation, then the pad/blank/oov utility tokens.
+        private static readonly string[] ExpectedTokens =
+        {
+            " ", "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M",
+            "N", "NG", "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH",
+            "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0",
+            "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "EH0", "EH1",
+            "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "IH0", "IH1", "IH2",
+            "IY0", "IY1", "IY2", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "UH0",
+            "UH1", "UH2", "UW0", "UW1", "UW2", "a", "b", "c", "d", "e", "f", "g",
+            "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
+            "v", "w", "x", "y", "z", "'", ",", ".", "!", "?", "-", ":", ";", "/",
+            "\"", "(", ")", "[", "]", "{", "}", "<pad>", "<blank>", "<oov>"
+        };
+
+        private const string SampleText =
+            "You've read the book “Operating Systems Design and Implementation, 3rd edition”. Did you?";
+        private const string NormalizedSampleText =
+            "You've read the book “Operating Systems Design and Implementation, third edition”. Did you?";
+        private const string SamplePronText =
+            "Y|UW1|V| |r|e|a|d| |t|h|e| |B|UH1|K| |“|o|p|e|r|a|t|i|n|g| |" +
+            "S|IH1|S|T|AH0|M|Z| |D|IH0|Z|AY1|N| |a|n|d| |IH2|M|P|L|AH0|" +
+            "M|EH0|N|T|EY1|SH|AH0|N|,| |TH|ER1|D| |e|d|i|t|i|o|n|”|.| |" +
+            "d|i|d| |Y|UW1|?";
+
+        // Token ids for NormalizedSampleText, including the leading and trailing space padding (id 0).
+        private static readonly int[] SampleParsed =
+        {
+            0, 22, 68, 20, 0, 87, 74, 70, 73, 0, 89, 77, 74,
+            0, 1, 65, 9, 0, 105, 84, 85, 74, 87, 70, 89, 78,
+            83, 76, 0, 16, 53, 16, 18, 31, 11, 23, 0, 3, 52,
+            23, 41, 12, 0, 70, 83, 73, 0, 54, 11, 14, 10, 31,
+            11, 43, 12, 18, 50, 17, 31, 12, 97, 0, 19, 47, 3,
+            0, 74, 73, 78, 89, 78, 84, 83, 105, 98, 0, 73, 78,
+            73, 0, 22, 68, 100, 0
+        };
+
+        [TestInitialize]
+        public void Initialize()
+        {
+            string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
+            // phonemeProbability: 1.0 disables random sampling so the expected output is deterministic.
+            _g2p = new EnglishG2p(
+                phonemeDict: Path.Combine(appDirPath, "Data", "cmudict-test"),
+                heteronyms: Path.Combine(appDirPath, "Data", "heteronyms-test"),
+                phonemeProbability: 1.0);
+            _tokenizer = new EnglishPhonemesTokenizer(
+                _g2p,
+                punct: true,
+                stresses: true,
+                chars: true,
+                apostrophe: true,
+                padWithSpace: true,
+                addBlankAt: BaseTokenizer.AddBlankAt.True);
+        }
+
+        [TestMethod]
+        public void TestTokenizerVocab()
+        {
+            // A token's id is its position in the vocabulary, so the order must match exactly;
+            // AreEquivalent would accept any permutation.
+            CollectionAssert.AreEqual(ExpectedTokens, _tokenizer!.Tokens);
+        }
+
+        [TestMethod]
+        public void TestEnglishG2p()
+        {
+            var pron = string.Join("|", _g2p!.Parse(NormalizedSampleText));
+            Assert.AreEqual(SamplePronText, pron);
+        }
+
+        [TestMethod]
+        public void TestEnglishEncode()
+        {
+            var parsed = _tokenizer!.Encode(NormalizedSampleText);
+            // The encoded sequence is order-sensitive, so compare element by element.
+            CollectionAssert.AreEqual(SampleParsed, parsed);
+        }
+
+        private EnglishG2p? _g2p;
+        private EnglishPhonemesTokenizer? _tokenizer;
+
+    }
+}
\ No newline at end of file
diff --git a/NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs b/NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs
new file mode 100644
index 0000000..1e97b60
--- /dev/null
+++ b/NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs
@@ -0,0 +1,75 @@
+// Copyright (c) Katsuya Iida. All Rights Reserved.
+// See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
+// largely located in the files found in this folder:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace NeMoOnnxSharp.TTSTokenizers
+{
+    /// <summary>
+    /// Base class for TTS tokenizers: holds the token/id tables and the ids of the utility tokens.
+    /// </summary>
+    public abstract class BaseTokenizer
+    {
+        public enum AddBlankAt
+        {
+            False,
+            True,
+            Last
+        }
+
+        // Utility token spellings. They must be distinct because every token is a key of the token-to-id map.
+        protected const string Pad = "<pad>";
+        protected const string Blank = "<blank>";
+        protected const string OOV = "<oov>";
+
+        protected BaseTokenizer()
+        {
+            _sep = string.Empty;
+            _id2token = Array.Empty<string>();
+            _token2id = new Dictionary<string, int>();
+            _utilIds = new HashSet<int>();
+        }
+
+        /// <summary>
+        /// Turns str text into int tokens.
+        /// </summary>
+        public abstract int[] Encode(string text);
+
+        ///
+        /// Turns ints tokens into str text.
+        ///
+        public string Decode(int[] tokens)
+        {
+            return string.Join(
+                _sep,
+                tokens
+                    .Where(t => !_utilIds.Contains(t))
+                    .Select(t => _id2token[t]));
+        }
+
+        public string[] Tokens { get { return _id2token; } }
+        public int PadId { get { return _pad; } }
+        public int BlankId { get { return _blank; } }
+        public int OOVId { get { return _oov; } }
+        public string Sep { get { return _sep; } }
+
+        protected string[] _id2token;
+        protected IDictionary<string, int> _token2id;
+        protected ISet<int> _utilIds;
+        protected int _space;
+        protected int _pad;
+        protected int _blank;
+        protected int _oov;
+        protected string _sep;
+        protected bool _padWithSpace;
+    }
+}
diff --git a/NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs b/NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs
new file mode 100644
index 0000000..559c9ec
--- /dev/null
+++ b/NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs
@@ -0,0 +1,213 @@
+// Copyright (c) Katsuya Iida. All Rights Reserved.
+// See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
+// largely located in the files found in this folder:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/g2p/models/en_us_arpabet.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace NeMoOnnxSharp.TTSTokenizers
+{
+    // nemo.collections.tts.torch.g2ps.EnglishG2p
+
+    /// <summary>
+    /// English G2P module. It converts words from grapheme to phoneme representation using a
+    /// pronunciation dictionary in CMUdict format.
+    /// Heteronyms, words with several dictionary pronunciations (when ambiguous words are ignored)
+    /// and words missing from the dictionary are returned as individual characters instead.
+    /// </summary>
+    public class EnglishG2p
+    {
+        private readonly IDictionary<string, string[]> _phonemeDict;
+        private readonly HashSet<string> _heteronyms;
+        private readonly double _phonemeProbability;
+        private readonly Random _random;
+        private readonly Regex _alnumRx;
+        private readonly bool _ignoreAmbiguousWords;
+
+        /// <summary>
+        /// Loads the pronunciation dictionary and the heteronym list from disk.
+        /// </summary>
+        /// <param name="phonemeDict">Path to a file in CMUdict format.</param>
+        /// <param name="heteronyms">Path to a file with heteronyms, one word per line.</param>
+        /// <param name="ignoreAmbiguousWords">
+        /// When true, words with more than one dictionary pronunciation are not phonemized.
+        /// </param>
+        /// <param name="encoding">Encoding of both files; ISO-8859-1 when null.</param>
+        /// <param name="phonemeProbability">
+        /// Probability that a word is looked up in the dictionary at all; with the remaining probability
+        /// it is returned as characters. Values of 1.0 or more disable the random choice.
+        /// </param>
+        /// <remarks>
+        /// Text is split into words with <see cref="TokenizerUtils.EnglishWordTokenize"/>.
+        /// </remarks>
+        public EnglishG2p(
+            string phonemeDict,
+            string heteronyms,
+            bool ignoreAmbiguousWords = true,
+            Encoding? encoding = null,
+            double phonemeProbability = 0.5)
+        {
+            encoding = encoding ?? Encoding.GetEncoding("iso-8859-1");
+            _phonemeDict = _ParseAsCmuDict(phonemeDict, encoding);
+            _heteronyms = new HashSet<string>(_ParseFileByLines(heteronyms, encoding));
+            _phonemeProbability = phonemeProbability;
+            _random = new Random();
+            _alnumRx = new Regex(@"[a-zA-ZÀ-ÿ\d]");
+            _ignoreAmbiguousWords = ignoreAmbiguousWords;
+        }
+
+        /// <summary>
+        /// Converts text into a sequence of phonemes and/or single characters.
+        /// </summary>
+        /// <param name="text">Text to convert.</param>
+        /// <returns>
+        /// One string per symbol: a phoneme for words resolved through the dictionary, otherwise one
+        /// string per character. Spans marked as unchangeable by the word tokenizer are copied through.
+        /// </returns>
+        public string[] Parse(string text)
+        {
+            var prons = new List<string>();
+            foreach (var (word, withoutChanges) in TokenizerUtils.EnglishWordTokenize(text))
+            {
+                if (withoutChanges)
+                {
+                    prons.AddRange(word);
+                    continue;
+                }
+
+                var wordStr = word[0];
+                var (pron, isHandled) = ParseOneWord(wordStr);
+
+                if (!isHandled)
+                {
+                    // A hyphenated word that could not be resolved as a whole is resolved part by part,
+                    // with "-" kept between the parts.
+                    var wordByHyphen = wordStr.Split('-');
+                    if (wordByHyphen.Length > 1)
+                    {
+                        pron = new List<string>();
+                        for (int i = 0; i < wordByHyphen.Length; i++)
+                        {
+                            if (i > 0)
+                            {
+                                pron.Add("-");
+                            }
+                            var (p, _) = ParseOneWord(wordByHyphen[i]);
+                            pron.AddRange(p);
+                        }
+                    }
+                }
+                prons.AddRange(pron);
+            }
+            return prons.ToArray();
+        }
+
+        /// <summary>
+        /// Resolves a single word. <c>isHandled</c> is false only when the word contains letters or digits
+        /// but no usable dictionary entry was found, in which case the characters are returned.
+        /// </summary>
+        private (List<string> pron, bool isHandled) ParseOneWord(string word)
+        {
+            // Randomly keep the grapheme form; the generator is only consulted when sampling is enabled.
+            if (_phonemeProbability < 1.0 && _random.NextDouble() > _phonemeProbability)
+            {
+                return (StringToStringList(word), true);
+            }
+
+            // punctuation or whitespace.
+            if (!_alnumRx.IsMatch(word))
+            {
+                return (StringToStringList(word), true);
+            }
+
+            // heteronyms
+            if (_heteronyms.Contains(word))
+            {
+                return (StringToStringList(word), true);
+            }
+
+            if (!_phonemeDict.ContainsKey(word))
+            {
+                // `'s` suffix: pronounce the stem and append "Z".
+                if (word.Length > 2
+                    && word.EndsWith("'s", StringComparison.Ordinal)
+                    && TryGetPronunciation(word.Substring(0, word.Length - 2), out var possessive))
+                {
+                    possessive.Add("Z");
+                    return (possessive, true);
+                }
+
+                // `s` suffix: same treatment for plain plurals.
+                if (word.Length > 1
+                    && word.EndsWith("s", StringComparison.Ordinal)
+                    && TryGetPronunciation(word.Substring(0, word.Length - 1), out var plural))
+                {
+                    plural.Add("Z");
+                    return (plural, true);
+                }
+            }
+
+            // phoneme dict
+            if (TryGetPronunciation(word, out var pron))
+            {
+                return (pron, true);
+            }
+
+            return (StringToStringList(word), false);
+        }
+
+        /// <summary>
+        /// Looks up the first pronunciation of <paramref name="word"/>. Fails when the word is unknown,
+        /// or when it has several pronunciations and ambiguous words are ignored.
+        /// </summary>
+        private bool TryGetPronunciation(string word, out List<string> phonemes)
+        {
+            if (_phonemeDict.TryGetValue(word, out var variants)
+                && (!_ignoreAmbiguousWords || variants.Length == 1))
+            {
+                phonemes = variants[0].Split(' ').ToList();
+                return true;
+            }
+
+            phonemes = new List<string>();
+            return false;
+        }
+
+        private List<string> StringToStringList(string word)
+        {
+            return word.Select(x => x.ToString()).ToList();
+        }
+
+        /// <summary>
+        /// Reads a CMUdict-format file into a map from lower-cased word to its pronunciations,
+        /// in file order. Only lines starting with an upper-case ASCII letter or an apostrophe are entries.
+        /// </summary>
+        private static IDictionary<string, string[]> _ParseAsCmuDict(string phonemeDictPath, Encoding encoding)
+        {
+            var altRe = new Regex(@"\([0-9]+\)");
+            var entries = new Dictionary<string, List<string>>();
+            // StreamReader(path, encoding) opens the file read-only, so read-only dictionary files work.
+            using (var reader = new StreamReader(phonemeDictPath, encoding))
+            {
+                string? line;
+                while ((line = reader.ReadLine()) != null)
+                {
+                    if (line.Length == 0)
+                    {
+                        continue;
+                    }
+
+                    var first = line[0];
+                    if (!(('A' <= first && first <= 'Z') || first == '\''))
+                    {
+                        continue;
+                    }
+
+                    // The word ends at the first whitespace; everything after it is the pronunciation.
+                    var wordEnd = line.IndexOfAny(new[] { ' ', '\t' });
+                    if (wordEnd < 0)
+                    {
+                        continue;
+                    }
+
+                    var pronunciation = line.Substring(wordEnd).Trim();
+                    if (pronunciation.Length == 0)
+                    {
+                        continue;
+                    }
+
+                    // "WORD(1)" is an alternative pronunciation of "WORD".
+                    var word = altRe.Replace(line.Substring(0, wordEnd), "").ToLowerInvariant();
+
+                    if (!entries.TryGetValue(word, out var variants))
+                    {
+                        variants = new List<string>();
+                        entries[word] = variants;
+                    }
+                    variants.Add(pronunciation);
+                }
+            }
+            return entries.ToDictionary(kv => kv.Key, kv => kv.Value.ToArray());
+        }
+
+        /// <summary>
+        /// Reads all lines of a file, with trailing whitespace removed from each line.
+        /// </summary>
+        private static string[] _ParseFileByLines(string p, Encoding encoding)
+        {
+            return File.ReadLines(p, encoding)
+                .Select(line => line.TrimEnd())
+                .ToArray();
+        }
+    }
+}
diff --git a/NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs b/NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs
new file mode 100644
index 0000000..2fc6ad8
--- /dev/null
+++ b/NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs
@@ -0,0 +1,224 @@
+// Copyright (c) Katsuya Iida. All Rights Reserved.
+// See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
+// largely located in the files found in this folder:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using static System.Net.Mime.MediaTypeNames;
+
+namespace NeMoOnnxSharp.TTSTokenizers
+{
+    // nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer
+    public class EnglishPhonemesTokenizer : BaseTokenizer
+    {
+        /// <summary>
+        /// English phoneme-based tokenizer.
+        /// </summary>
+        /// <param name="g2p">Grapheme to phoneme module.</param>
+        /// <param name="punct">Whether to reserve grapheme for basic punctuation or not.</param>
+        /// <param name="nonDefaultPunctList">List of punctuation marks which will be used instead default.</param>
+        /// <param name="stresses">Whether to use phonemes codes with stresses (0-2) or not.</param>
+        /// <param name="chars">Whether to additionally use chars together with phonemes. It is useful if g2p module can return chars too.</param>
+        /// <param name="space">Space token as string.</param>
+        /// <param name="silence">Silence token as string. Only <c>null</c> is supported.</param>
+        /// <param name="apostrophe">Whether to use apostrophe or not.</param>
+        /// <param name="oov">OOV token as string.</param>
+        /// <param name="sep">Separation token as string.</param>
+        /// <param name="addBlankAt">Where the blank token is added to the labels. Only <c>AddBlankAt.True</c> is supported.</param>
+        /// <param name="padWithSpace">Whether to pad text with spaces at the beginning and at the end or not.</param>
+        /// <remarks>
+        /// Text preprocessing is not configurable: <see cref="Encode"/> always normalizes the text without
+        /// lowercasing it, because the text may contain phonemes; words are lowercased during G2P.
+        /// </remarks>
+        /// <exception cref="NotImplementedException">
+        /// <paramref name="silence"/> is not null, or <paramref name="addBlankAt"/> is not <c>AddBlankAt.True</c>.
+        /// </exception>
+        public EnglishPhonemesTokenizer(
+            EnglishG2p g2p,
+            bool punct = true,
+            string[]? nonDefaultPunctList = null,
+            bool stresses = false,
+            bool chars = false,
+            string space = " ",
+            string? silence = null,
+            bool apostrophe = true,
+            string oov = BaseTokenizer.OOV,
+            string sep = "|", // To be able to distinguish between 2/3 letters codes.
+            AddBlankAt addBlankAt = AddBlankAt.False,
+            bool padWithSpace = false)
+        {
+            if (silence != null)
+            {
+                throw new NotImplementedException();
+            }
+            if (addBlankAt != AddBlankAt.True)
+            {
+                throw new NotImplementedException();
+            }
+
+            _phonemeProbability = null;
+            _g2p = g2p;
+
+            var tokens = new List<string>();
+            _space = tokens.Count;
+            tokens.Add(space);
+
+            tokens.AddRange(Consonants);
+
+            if (stresses)
+            {
+                // Every vowel gets one token per stress level 0-2, e.g. "AA0", "AA1", "AA2".
+                tokens.AddRange(Vowels.SelectMany(p => Enumerable.Range(0, 3), (p, s) => $"{p}{s}"));
+            }
+            else
+            {
+                tokens.AddRange(Vowels);
+            }
+
+            // Characters are also enabled whenever a phoneme probability is set.
+            if (chars || _phonemeProbability != null)
+            {
+                tokens.AddRange(AsciiLowercase.Select(ch => ch.ToString()));
+            }
+
+            if (apostrophe)
+            {
+                tokens.Add("'"); // Apostrophe
+            }
+
+            if (punct)
+            {
+                tokens.AddRange(nonDefaultPunctList ?? PunctList);
+            }
+
+            // The id of a token is the index at which it is appended, so read Count before Add.
+            _pad = tokens.Count;
+            tokens.Add(Pad);
+            _blank = tokens.Count;
+            tokens.Add(Blank);
+            _oov = tokens.Count;
+            tokens.Add(oov); // Out Of Vocabulary
+
+            _sep = sep;
+            _padWithSpace = padWithSpace;
+
+            _id2token = tokens.ToArray();
+            var token2id = new Dictionary<string, int>(_id2token.Length);
+            for (int i = 0; i < _id2token.Length; i++)
+            {
+                // Add throws on a duplicate token, which would otherwise make ids ambiguous.
+                token2id.Add(_id2token[i], i);
+            }
+            _token2id = token2id;
+            _utilIds = new HashSet<int>() { _pad, _blank, _oov };
+
+            _stresses = stresses;
+            _punct = punct;
+        }
+
+        /// <summary>
+        /// Normalizes the text, runs it through G2P and encodes the result into token ids.
+        /// </summary>
+        public override int[] Encode(string text)
+        {
+            // Do not lowercase here: the text may contain upper-case phoneme codes. Words are
+            // lowercased by the G2P word tokenizer.
+            text = TokenizerUtils.EnglishTextPreprocessing(text, lower: false);
+            var g2pText = _g2p.Parse(text);
+            return EncodeFromG2p(g2pText);
+        }
+
+        /// <summary>
+        /// Encodes text that has already been run through G2P.
+        /// Called for encoding to tokens after text preprocessing and G2P.
+        /// </summary>
+        /// <param name="g2pText">G2P's output, could be a mixture of phonemes and graphemes,
+        /// e.g. "see OOV" -> ['S', 'IY1', ' ', 'O', 'O', 'V']
+        /// </param>
+        /// <returns>
+        /// Token ids of the known symbols, in order. Unknown symbols are skipped, runs of spaces are
+        /// collapsed, leading and trailing spaces are removed, and one space id is added at each end
+        /// when padding with spaces is enabled.
+        /// </returns>
+        public int[] EncodeFromG2p(string[] g2pText)
+        {
+            if (g2pText == null)
+            {
+                throw new ArgumentNullException(nameof(g2pText));
+            }
+
+            var space = _id2token[_space];
+            var ps = new List<string>(g2pText.Length);
+            foreach (var symbol in g2pText)
+            {
+                // char.IsLetterOrDigit(p, 0) below throws on an empty string.
+                if (string.IsNullOrEmpty(symbol))
+                {
+                    continue;
+                }
+
+                var p = symbol;
+                // Remove stress
+                if (p.Length == 3 && !_stresses)
+                {
+                    p = p.Substring(0, 2);
+                }
+
+                if (p == space)
+                {
+                    // Add space if last one isn't one; a leading or repeated space is dropped.
+                    if (ps.Count > 0 && ps[ps.Count - 1] != space)
+                    {
+                        ps.Add(p);
+                    }
+                    continue;
+                }
+
+                if (!_token2id.ContainsKey(p))
+                {
+                    // Unknown char/phoneme
+                    continue;
+                }
+
+                // Phonemes, chars and the apostrophe are always kept; other symbols only with punct.
+                if (char.IsLetterOrDigit(p, 0) || p == "'" || _punct)
+                {
+                    ps.Add(p);
+                }
+            }
+
+            // Remove trailing spaces
+            while (ps.Count > 0 && ps[ps.Count - 1] == space)
+            {
+                ps.RemoveAt(ps.Count - 1);
+            }
+
+            var res = new List<int>(ps.Count + 2);
+            if (_padWithSpace)
+            {
+                res.Add(_space);
+            }
+            // Encode the filtered symbols, not the raw G2P output.
+            foreach (var p in ps)
+            {
+                res.Add(_token2id[p]);
+            }
+            if (_padWithSpace)
+            {
+                res.Add(_space);
+            }
+            return res.ToArray();
+        }
+
+        private static readonly string[] PunctList =
+        {   // Derived from LJSpeech and "/" additionally
+            ",", ".", "!", "?", "-",
+            ":", ";", "/", "\"", "(",
+            ")", "[", "]", "{", "}",
+        };
+        private static readonly string[] Vowels = {
+            "AA", "AE", "AH", "AO", "AW",
+            "AY", "EH", "ER", "EY", "IH",
+            "IY", "OW", "OY", "UH", "UW",
+        };
+        private static readonly string[] Consonants = {
+            "B", "CH", "D", "DH", "F", "G",
+            "HH", "JH", "K", "L", "M", "N",
+            "NG", "P", "R", "S", "SH", "T",
+            "TH", "V", "W", "Y", "Z", "ZH",
+        };
+
+        private const string AsciiLowercase = "abcdefghijklmnopqrstuvwxyz";
+
+        private readonly EnglishG2p _g2p;
+        private readonly object?
_phonemeProbability;
+        private readonly bool _stresses;
+        private readonly bool _punct;
+    }
+}
diff --git a/NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs b/NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs
new file mode 100644
index 0000000..23b7796
--- /dev/null
+++ b/NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs
@@ -0,0 +1,147 @@
+// Copyright (c) Katsuya Iida. All Rights Reserved.
+// See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
+// largely located in the files found in this folder:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using System.Data.SqlTypes;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace NeMoOnnxSharp.TTSTokenizers
+{
+    /// <summary>
+    /// Text normalization and word tokenization helpers for the English TTS tokenizers.
+    /// </summary>
+    public static class TokenizerUtils
+    {
+        private static readonly Dictionary<char, char> _synoGlyph2Ascii;
+        private static readonly Regex _wordsReEn;
+
+        static TokenizerUtils()
+        {
+            // ASCII character and the typographic look-alikes that are replaced by it.
+            (char Ascii, char[] Glyphs)[] synoglyphs =
+            {
+                ('\'', new[] { '’' }),
+                ('"', new[] { '”', '“' }),
+            };
+
+            _synoGlyph2Ascii = new Dictionary<char, char>();
+            foreach (var (asc, glyphs) in synoglyphs)
+            {
+                foreach (var g in glyphs)
+                {
+                    _synoGlyph2Ascii[g] = asc;
+                }
+            }
+
+            // define char set based on https://en.wikipedia.org/wiki/List_of_Unicode_characters
+            // Group 1: a word, group 2: an unchangeable span between two '|', group 3: anything else.
+            var latinAlphabetBasic = "A-Za-z";
+            _wordsReEn = new Regex(@$"([{latinAlphabetBasic}]+(?:[{latinAlphabetBasic}\-']*[{latinAlphabetBasic}]+)*)|(\|[^|]*\|)|([^{latinAlphabetBasic}|]+)");
+        }
+
+        /// <summary>
+        /// Normalizes text for the English tokenizers: decomposes characters (NFD), drops non-spacing
+        /// marks such as accents, and replaces typographic quotes and apostrophes by their ASCII forms.
+        /// </summary>
+        /// <param name="text">Text to normalize.</param>
+        /// <param name="lower">Whether to lowercase the result (culture-invariant).</param>
+        /// <returns>The normalized text.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
+        public static string EnglishTextPreprocessing(string text, bool lower = true)
+        {
+            if (text == null)
+            {
+                throw new ArgumentNullException(nameof(text));
+            }
+
+            var builder = new StringBuilder(text.Length);
+            foreach (var ch in text.Normalize(NormalizationForm.FormD))
+            {
+                if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
+                {
+                    continue;
+                }
+                builder.Append(_synoGlyph2Ascii.TryGetValue(ch, out var ascii) ? ascii : ch);
+            }
+
+            var normalized = builder.ToString();
+            // Invariant casing keeps the result independent of the current culture (e.g. Turkish 'I').
+            return lower ? normalized.ToLowerInvariant() : normalized;
+        }
+
+        /// <summary>
+        /// Converts regex matches into word tokens, each with a flag telling whether it must be left unchanged.
+        /// A match is either a word, an unchangeable span written between two <c>|</c> characters, or a run
+        /// of punctuation and whitespace. Unchangeable spans are split on spaces and never lowercased.
+        /// For example, the text <c>Hello World |NVIDIA unchanged|!</c> yields
+        /// <c>(["Hello"], false), ([" "], false), (["World"], false), ([" "], false),
+        /// (["NVIDIA", "unchanged"], true), (["!"], false)</c>.
+        /// </summary>
+        /// <param name="words">Non-overlapping matches of the word pattern.</param>
+        /// <param name="isLower">Whether to lowercase words. By default, it is false.</param>
+        /// <returns>A list of tuples like (a list of words, is_unchanged).</returns>
+        /// <exception cref="InvalidDataException">A match has no non-empty capture group.</exception>
+        private static (string[], bool)[] _wordTokenize(MatchCollection words, bool isLower = false)
+        {
+            var result = new List<(string[], bool)>();
+            foreach (Match word in words)
+            {
+                // Groups[0] is the whole match; the capture groups of the pattern start at index 1.
+                var maybeWord = word.Groups[1].Value;
+                var maybeWithoutChanges = word.Groups[2].Value;
+                var maybePunct = word.Groups[3].Value;
+
+                var withoutChanges = false;
+                string[] token;
+                if (!string.IsNullOrEmpty(maybeWord))
+                {
+                    token = new[] { isLower ? maybeWord.ToLowerInvariant() : maybeWord };
+                }
+                else if (!string.IsNullOrEmpty(maybePunct))
+                {
+                    token = new[] { maybePunct };
+                }
+                else if (!string.IsNullOrEmpty(maybeWithoutChanges))
+                {
+                    withoutChanges = true;
+                    // Strip the enclosing '|' characters before splitting.
+                    token = maybeWithoutChanges.Substring(1, maybeWithoutChanges.Length - 2).Split(' ');
+                }
+                else
+                {
+                    throw new InvalidDataException(
+                        $"This is not expected. Found empty string: <{word}>. " +
+                        $"Please validate your regular expression pattern '_WORDS_RE_EN' or '_WORDS_RE_ANY_LOCALE'."
+                    );
+                }
+
+                result.Add((token, withoutChanges));
+            }
+            return result.ToArray();
+        }
+
+        /// <summary>
+        /// Splits English text into lowercased words, punctuation/whitespace runs and unchangeable spans.
+        /// </summary>
+        /// <param name="text">Text to split.</param>
+        /// <returns>A list of tuples like (a list of words, is_unchanged), in text order.</returns>
+        public static (string[], bool)[] EnglishWordTokenize(string text)
+        {
+            var words = _wordsReEn.Matches(text);
+            return _wordTokenize(words, isLower: true);
+        }
+    }
+}