From 4ad71cb525dfd6704a53f3dd98e76a6c5f6cd8b3 Mon Sep 17 00:00:00 2001
From: Katsuya Iida <katsuya.iida@gmail.com>
Date: Sat, 30 Sep 2023 14:04:15 +0900
Subject: [PATCH] Add English G2P and tokenizer. (#15)

---
 NeMoOnnxSharp.Example/PretrainedModelInfo.cs  |  10 +
 NeMoOnnxSharp.Example/Program.cs              |  22 ++
 NeMoOnnxSharp.Tests/Data/cmudict-test         |  21 ++
 NeMoOnnxSharp.Tests/Data/heteronyms-test      |   1 +
 .../NeMoOnnxSharp.Tests.csproj                |   6 +
 NeMoOnnxSharp.Tests/TextTokenizersTest.cs     |  89 +++++++
 NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs  |  75 ++++++
 NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs     | 213 +++++++++++++++++
 .../TTSTokenizers/EnglishPhonemesTokenizer.cs | 224 ++++++++++++++++++
 NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs | 147 ++++++++++++
 10 files changed, 808 insertions(+)
 create mode 100644 NeMoOnnxSharp.Tests/Data/cmudict-test
 create mode 100644 NeMoOnnxSharp.Tests/Data/heteronyms-test
 create mode 100644 NeMoOnnxSharp.Tests/TextTokenizersTest.cs
 create mode 100644 NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs
 create mode 100644 NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs
 create mode 100644 NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs
 create mode 100644 NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs

diff --git a/NeMoOnnxSharp.Example/PretrainedModelInfo.cs b/NeMoOnnxSharp.Example/PretrainedModelInfo.cs
index 999722f..ed2b6ad 100644
--- a/NeMoOnnxSharp.Example/PretrainedModelInfo.cs
+++ b/NeMoOnnxSharp.Example/PretrainedModelInfo.cs
@@ -45,6 +45,16 @@ private static PretrainedModelInfo[] CreateModelList()
                     "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/commandrecognition_en_matchboxnet3x1x64_v2.onnx",
                     "a0c5e4d14e83d3b6afdaf239265a390c2ca513bcdedf3d295bc1f9f97f19868a"
                 ),
+                new PretrainedModelInfo(
+                    "cmudict-0.7b_nv22.10",
+                    "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/cmudict-0.7b_nv22.10",
+                    "d330f3a3554d4c7ff8ef7bfc0c338ed74831d5f54109508fb829bdd82173608b"
+                ),
+                new PretrainedModelInfo(
+                    "heteronyms-052722",
+                    "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/heteronyms-052722",
+                    "b701909aedf753172eff223950f8859cd4b9b4c80199cf0a6e9ac4a307c8f8ec"
+                ),
                 new PretrainedModelInfo(
                     "tts_en_hifigan",
                     "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/tts_en_hifigan.onnx",
diff --git a/NeMoOnnxSharp.Example/Program.cs b/NeMoOnnxSharp.Example/Program.cs
index 1ccf991..0bf9cb2 100644
--- a/NeMoOnnxSharp.Example/Program.cs
+++ b/NeMoOnnxSharp.Example/Program.cs
@@ -6,6 +6,7 @@
 using System.Text;
 using System.Threading.Tasks;
 using System.Collections.Generic;
+using NeMoOnnxSharp.TTSTokenizers;
 
 namespace NeMoOnnxSharp.Example
 {
@@ -19,6 +20,10 @@ static async Task Main(string[] args)
             {
                 await Transcribe();
             }
+            else if (task == "speak")
+            {
+                await Speak();
+            }
             else if (task == "vad")
             {
                 await FramePredict(false);
@@ -54,6 +59,23 @@ static async Task Transcribe()
             }
         }
 
+        // Example task: fetches the G2P resources via DownloadModelAsync, tokenizes a sample sentence and prints the ids.
+        static async Task Speak()
+        {
+            string phoneDict = await DownloadModelAsync("cmudict-0.7b_nv22.10");
+            string heteronyms = await DownloadModelAsync("heteronyms-052722");
+            var g2p = new EnglishG2p(phoneDict, heteronyms);
+            var tokenizer = new EnglishPhonemesTokenizer(
+                g2p,
+                punct: true,
+                stresses: true,
+                chars: true,
+                apostrophe: true,
+                padWithSpace: true,
+                addBlankAt: BaseTokenizer.AddBlankAt.True);
+            Console.WriteLine(string.Join(" ", tokenizer.Encode("Hello world!")));
+        }
+
         static async Task FramePredict(bool mbn)
         {
             string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
diff --git a/NeMoOnnxSharp.Tests/Data/cmudict-test b/NeMoOnnxSharp.Tests/Data/cmudict-test
new file mode 100644
index 0000000..85f77af
--- /dev/null
+++ b/NeMoOnnxSharp.Tests/Data/cmudict-test
@@ -0,0 +1,21 @@
+# Comment
+YOU'VE  Y UW1 V
+READ  R EH1 D
+READ(1)  R IY1 D
+BOOK  B UH1 K
+THE  DH AH0
+THE(1)  DH AH1
+THE(2)  DH IY0
+OPERATING  AA1 P ER0 EY2 T IH0 NG
+OPERATING(1)  AO1 P ER0 EY2 T IH0 NG
+SYSTEM  S IH1 S T AH0 M
+DESIGN  D IH0 Z AY1 N
+AND  AH0 N D
+AND(1)  AE1 N D
+IMPLEMENTATION  IH2 M P L AH0 M EH0 N T EY1 SH AH0 N
+THIRD  TH ER1 D
+EDITION  AH0 D IH1 SH AH0 N
+EDITION(1)  IH0 D IH1 SH AH0 N
+DID  D IH1 D
+DID(1)  D IH0 D
+YOU  Y UW1
diff --git a/NeMoOnnxSharp.Tests/Data/heteronyms-test b/NeMoOnnxSharp.Tests/Data/heteronyms-test
new file mode 100644
index 0000000..70705e7
--- /dev/null
+++ b/NeMoOnnxSharp.Tests/Data/heteronyms-test
@@ -0,0 +1 @@
+read
\ No newline at end of file
diff --git a/NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj b/NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj
index c030242..1b2c870 100644
--- a/NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj
+++ b/NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj
@@ -45,6 +45,12 @@
   </ItemGroup>
 
   <ItemGroup>
+    <None Update="Data\cmudict-test">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+    <None Update="Data\heteronyms-test">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
     <None Update="Data\make_test.py">
       <CopyToOutputDirectory>Never</CopyToOutputDirectory>
     </None>
diff --git a/NeMoOnnxSharp.Tests/TextTokenizersTest.cs b/NeMoOnnxSharp.Tests/TextTokenizersTest.cs
new file mode 100644
index 0000000..1ee8503
--- /dev/null
+++ b/NeMoOnnxSharp.Tests/TextTokenizersTest.cs
@@ -0,0 +1,89 @@
+﻿using Microsoft.VisualStudio.TestTools.UnitTesting;
+using NeMoOnnxSharp.TTSTokenizers;
+using System;
+using System.Diagnostics;
+using System.IO;
+
+namespace NeMoOnnxSharp.Tests
+{
+    [TestClass]
+    public class TextTokenizersTest
+    {
+        private readonly static string[] ExpectedTokens =
+        {
+            " ", "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M",
+            "N", "NG", "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH",
+            "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0",
+            "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "EH0", "EH1",
+            "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "IH0", "IH1", "IH2",
+            "IY0", "IY1", "IY2", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "UH0",
+            "UH1", "UH2", "UW0", "UW1", "UW2", "a", "b", "c", "d", "e", "f", "g",
+            "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
+            "v", "w", "x", "y", "z", "'", ",", ".", "!", "?", "-", ":", ";", "/",
+            "\"", "(", ")", "[", "]", "{", "}", "<pad>", "<blank>", "<oov>"
+        };
+
+        private const string SampleText =
+            "You've read the book “Operating Systems Design and Implementation, 3rd edition”. Did you?";
+        private const string NormalizedSampleText =
+            "You've read the book “Operating Systems Design and Implementation, third edition”. Did you?";
+        private const string SamplePronText =
+            "Y|UW1|V| |r|e|a|d| |t|h|e| |B|UH1|K| |“|o|p|e|r|a|t|i|n|g| |"
+            + "S|IH1|S|T|AH0|M|Z| |D|IH0|Z|AY1|N| |a|n|d| |IH2|M|P|L|AH0|"
+            + "M|EH0|N|T|EY1|SH|AH0|N|,| |TH|ER1|D| |e|d|i|t|i|o|n|”|.| |"
+            + "d|i|d| |Y|UW1|?";
+
+        private readonly static int[] SampleParsed =
+        {
+             0,  22,  68,  20,   0,  87,  74,  70,  73,   0,  89,  77,  74,
+             0,   1,  65,   9,   0, 105,  84,  85,  74,  87,  70,  89,  78,
+            83,  76,   0,  16,  53,  16,  18,  31,  11,  23,   0,   3,  52,
+            23,  41,  12,   0,  70,  83,  73,   0,  54,  11,  14,  10,  31,
+            11,  43,  12,  18,  50,  17,  31,  12,  97,   0,  19,  47,   3,
+             0,  74,  73,  78,  89,  78,  84,  83, 105,  98,   0,  73,  78,
+            73,   0,  22,  68, 100,   0
+        };
+
+        [TestInitialize]
+        public void Initialize()
+        {
+            string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
+            _g2p = new EnglishG2p(
+                phonemeDict: Path.Combine(appDirPath, "Data", "cmudict-test"),
+                heteronyms: Path.Combine(appDirPath, "Data", "heteronyms-test"),
+                phonemeProbability: 1.0);
+            _tokenizer = new EnglishPhonemesTokenizer(
+                _g2p,
+                punct: true,
+                stresses: true,
+                chars: true,
+                apostrophe: true,
+                padWithSpace: true,
+                addBlankAt: BaseTokenizer.AddBlankAt.True);
+        }
+
+        [TestMethod]
+        public void TestTokenizerVocab()
+        {
+            CollectionAssert.AreEqual(ExpectedTokens, _tokenizer.Tokens);  // Order matters: the index of a token is its id.
+        }
+
+        [TestMethod]
+        public void TestEnglishG2p()
+        {
+            var pron = string.Join("|", _g2p.Parse(NormalizedSampleText));
+            Assert.AreEqual(SamplePronText, pron);
+        }
+
+        [TestMethod]
+        public void TestEnglishEncode()
+        {
+            var parsed = _tokenizer.Encode(NormalizedSampleText);
+            CollectionAssert.AreEqual(SampleParsed, parsed);  // Order-sensitive: an encoded sentence is a sequence, not a bag.
+        }
+
+        private EnglishG2p? _g2p;
+        private EnglishPhonemesTokenizer? _tokenizer;
+
+    }
+}
\ No newline at end of file
diff --git a/NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs b/NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs
new file mode 100644
index 0000000..1e97b60
--- /dev/null
+++ b/NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs
@@ -0,0 +1,75 @@
+﻿// Copyright (c) Katsuya Iida.  All Rights Reserved.
+// See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
+// largely located in the files found in this folder:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace NeMoOnnxSharp.TTSTokenizers
+{
+    public abstract class BaseTokenizer
+    {
+        public enum AddBlankAt
+        {
+            False,
+            True,
+            Last
+        }
+
+        protected const string Pad = "<pad>";
+        protected const string Blank = "<blank>";
+        protected const string OOV = "<oov>";
+
+        protected BaseTokenizer()
+        {
+            _sep = string.Empty;
+            _id2token = Array.Empty<string>();
+            _token2id = new Dictionary<string, int>();
+            _utilIds = new HashSet<int>();
+        }
+
+        /// <summary>
+        /// Turns str text into int tokens.
+        /// </summary>
+        public abstract int[] Encode(string text);
+
+        /// <summary>
+        /// Turns int tokens back into text joined with <see cref="Sep"/>; ids in the utility-id set are skipped.
+        /// </summary>
+        public string Decode(int[] tokens)
+        {
+            return string.Join(
+                _sep,
+                tokens
+                .Where(t => !_utilIds.Contains(t))
+                .Select(t => _id2token[t]));
+        }
+
+        public string[] Tokens { get { return _id2token; } }
+        public int PadId { get { return _pad; } }
+        public int BlankId { get { return _blank; } }
+        public int OOVId { get { return _oov; } }
+        public string Sep { get { return _sep; } }
+
+        protected string[] _id2token;
+        protected IDictionary<string, int> _token2id;
+        protected ISet<int> _utilIds;
+        protected int _space;
+        protected int _pad;
+        protected int _blank;
+        protected int _oov;
+        protected string _sep;
+        protected bool _padWithSpace;
+    }
+}
diff --git a/NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs b/NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs
new file mode 100644
index 0000000..559c9ec
--- /dev/null
+++ b/NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs
@@ -0,0 +1,213 @@
+﻿// Copyright (c) Katsuya Iida.  All Rights Reserved.
+// See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
+// largely located in the files found in this folder:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/g2p/models/en_us_arpabet.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace NeMoOnnxSharp.TTSTokenizers
+{
+    // nemo.collections.tts.torch.g2ps.EnglishG2p
+
+    /// <summary>
+    /// English G2P module. This module converts words from grapheme to phoneme representation using phoneme_dict in CMU dict format.
+    /// Optionally, it can ignore words which are heteronyms, ambiguous or marked as unchangeable by word_tokenize_func(see code for details).
+    /// Ignored words are left unchanged or passed through apply_to_oov_word for handling.
+    /// </summary>
+    public class EnglishG2p
+    {
+        private readonly IDictionary<string, string[]> _phonemeDict;
+        private readonly HashSet<string> _heteronyms;
+        private readonly double _phonemeProbability;
+        private readonly Random _random;
+        private readonly Regex _alnumRx;
+        private readonly bool _ignoreAmbiguousWords;
+
+        /// <summary>Loads the pronunciation dictionary and the heteronym list used by <see cref="Parse"/>.</summary>
+        /// <param name="phonemeDict">Path to a file in CMUdict format.</param>
+        /// <param name="heteronyms">Path to a file with heteronyms, one word per line.</param>
+        /// <param name="ignoreAmbiguousWords">If true, a word with several dictionary pronunciations is returned as characters.</param>
+        /// <param name="encoding">Encoding used to read both files; ISO-8859-1 is used when null.</param>
+        /// <param name="phonemeProbability">The probability (0 to 1) that each word is phonemized. Defaults to 0.5; a value of 1.0 or more always phonemizes.
+        /// Words that cannot be phonemized (no entry in the g2p dict, a listed heteronym, or several entries while
+        /// <paramref name="ignoreAmbiguousWords"/> is true) are returned as characters regardless of this value.</param>
+        public EnglishG2p(
+            string phonemeDict,
+            string heteronyms,
+            bool ignoreAmbiguousWords = true,
+            Encoding? encoding = null,
+            double phonemeProbability = 0.5)
+        {
+            encoding = encoding ?? Encoding.GetEncoding("iso-8859-1");
+            _phonemeDict = _ParseAsCmuDict(phonemeDict, encoding);
+            _heteronyms = new HashSet<string>(_ParseFileByLines(heteronyms, encoding));
+            _phonemeProbability = phonemeProbability;
+            _random = new Random();
+            _alnumRx = new Regex(@"[a-zA-ZÀ-ÿ\d]");
+            _ignoreAmbiguousWords = ignoreAmbiguousWords;
+        }
+
+        public string[] Parse(string text)
+        {
+            var words = TokenizerUtils.EnglishWordTokenize(text);
+            var prons = new List<string>();
+            foreach (var (word, withoutChanges) in words)
+            {
+                if (withoutChanges)
+                {
+                    prons.AddRange(word);
+                    continue;
+                }
+
+                var wordStr = word[0];
+                var wordByHyphen = wordStr.Split('-');
+                var (pron, isHandled) = ParseOneWord(wordStr);
+
+                if (!isHandled && wordByHyphen.Length > 1)
+                {
+                    pron = new List<string>();
+                    foreach (var subWord in wordByHyphen)
+                    {
+                        var (p, _) = ParseOneWord(subWord);
+                        pron.AddRange(p);
+                        pron.Add("-");
+                    }
+                    pron.RemoveAt(pron.Count - 1);
+                }
+                prons.AddRange(pron);
+            }
+            return prons.ToArray();
+        }
+
+        // Converts a single word into phonemes when one of the rules below applies; otherwise returns its characters.
+        // isHandled is false only when no rule applied, which lets Parse retry a hyphenated word part by part.
+        private (List<string> pron, bool isHandled) ParseOneWord(string word)
+        {
+            if (_phonemeProbability < 1.0 && _random.NextDouble() > _phonemeProbability)
+            {
+                return (StringToStringList(word), true);
+            }
+
+            // punctuation or whitespace.
+            if (!_alnumRx.IsMatch(word))
+            {
+                return (StringToStringList(word), true);
+            }
+
+            // heteronyms are never phonemized
+            if (_heteronyms.Contains(word))
+            {
+                return (StringToStringList(word), true);
+            }
+
+            // `'s` suffix: reuse the base word's pronunciation and append "Z" (ordinal match, not culture-sensitive)
+            if (word.Length > 2 && word.EndsWith("'s", StringComparison.Ordinal)
+                && !_phonemeDict.ContainsKey(word))
+            {
+                var sword = word.Substring(0, word.Length - 2);
+                if (_phonemeDict.ContainsKey(sword)
+                    && (!_ignoreAmbiguousWords || _IsUniqueInPhonemeDict(sword)))
+                {
+                    var pron = _phonemeDict[sword][0].Split(" ").ToList();
+                    pron.Add("Z");
+                    return (pron, true);
+                }
+            }
+
+            // `s` suffix: same rule for a plain trailing "s"
+            if (word.Length > 1 && word.EndsWith("s", StringComparison.Ordinal)
+                && !_phonemeDict.ContainsKey(word))
+            {
+                var sword = word.Substring(0, word.Length - 1);
+                if (_phonemeDict.ContainsKey(sword)
+                    && (!_ignoreAmbiguousWords || _IsUniqueInPhonemeDict(sword)))
+                {
+                    var pron = _phonemeDict[sword][0].Split(" ").ToList();
+                    pron.Add("Z");
+                    return (pron, true);
+                }
+            }
+
+            // phoneme dict: the first pronunciation is used; ambiguous words are skipped when configured so
+            if (_phonemeDict.ContainsKey(word) && (!_ignoreAmbiguousWords || _IsUniqueInPhonemeDict(word)))
+            {
+                return (_phonemeDict[word][0].Split(" ").ToList(), true);
+            }
+
+            return (StringToStringList(word), false);
+        }
+
+        private List<string> StringToStringList(string word)
+        {
+            return word.Select(x => x.ToString()).ToList();
+        }
+
+        private bool _IsUniqueInPhonemeDict(string word)
+        {
+            return _phonemeDict[word].Length == 1;
+        }
+
+        // Reads a CMUdict-format file into a map from lower-cased word to all of its pronunciations.
+        // Alternate entries ("WORD(1)") are stored under the same key as "WORD", in file order.
+        private static IDictionary<string, string[]> _ParseAsCmuDict(string phonemeDictPath, Encoding encoding)
+        {
+            var _alt_re = new Regex(@"\([0-9]+\)");
+            var g2pDict = new Dictionary<string, string[]>();
+            // Open read-only: FileStream(path, mode) alone requests ReadWrite access and fails on read-only files.
+            using (var stream = new FileStream(phonemeDictPath, FileMode.Open, FileAccess.Read))
+            using (var reader = new StreamReader(stream, encoding))
+            {
+                while (reader.ReadLine() is string line)
+                {
+                    // Entries start with an upper-case ASCII letter or an apostrophe; every other line is skipped.
+                    if (line.Length > 0 && (('A' <= line[0] && line[0] <= 'Z') || line[0] == '\''))
+                    {
+                        var parts = line.Split("  ");
+                        if (parts.Length < 2)
+                        {
+                            throw new InvalidDataException($"Malformed CMUdict entry (no double-space separator): '{line}'");
+                        }
+
+                        // Drop the "(n)" alternate marker; invariant casing keeps the keys culture-independent.
+                        var word = _alt_re.Replace(parts[0], "").ToLowerInvariant();
+                        var pronunciation = parts[1].Trim();
+
+                        // Append to the pronunciations already known for this word, or start a new entry.
+                        g2pDict[word] = g2pDict.TryGetValue(word, out var known)
+                            ? known.Append(pronunciation).ToArray()
+                            : new string[] { pronunciation };
+                    }
+                }
+            }
+            return g2pDict;
+        }
+
+        // Reads a text file (opened read-only) and returns its lines with trailing whitespace removed.
+        private static string[] _ParseFileByLines(string p, Encoding encoding)
+        {
+            var res = new List<string>();
+            using (var stream = new FileStream(p, FileMode.Open, FileAccess.Read))
+            using (var reader = new StreamReader(stream, encoding))
+            {
+                while (reader.ReadLine() is string line)
+                {
+                    res.Add(line.TrimEnd());
+                }
+            }
+            return res.ToArray();
+        }
+    }
+}
diff --git a/NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs b/NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs
new file mode 100644
index 0000000..2fc6ad8
--- /dev/null
+++ b/NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs
@@ -0,0 +1,224 @@
+﻿// Copyright (c) Katsuya Iida.  All Rights Reserved.
+// See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
+// largely located in the files found in this folder:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using static System.Net.Mime.MediaTypeNames;
+
+namespace NeMoOnnxSharp.TTSTokenizers
+{
+    // nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer
+    public class EnglishPhonemesTokenizer : BaseTokenizer
+    {
+        /// <summary>
+        /// English phoneme-based tokenizer.
+        /// </summary>
+        /// <param name="g2p">Grapheme to phoneme module.</param>
+        /// <param name="punct">Whether to reserve grapheme for basic punctuation or not.</param>
+        /// <param name="nonDefaultPunctList">List of punctuation marks which will be used instead of the default list.</param>
+        /// <param name="stresses">Whether to use phonemes codes with stresses (0-2) or not.</param>
+        /// <param name="chars">Whether to additionally use chars together with phonemes. It is useful if g2p module can return chars too.</param>
+        /// <param name="space">Space token as string.</param>
+        /// <param name="silence">Silence token as string; must be null, any other value throws <see cref="NotImplementedException"/>.</param>
+        /// <param name="apostrophe">Whether to use apostrophe or not.</param>
+        /// <param name="oov">OOV token as string.</param>
+        /// <param name="sep">Separation token as string.</param>
+        /// <param name="addBlankAt">Where the blank token is placed. Only <see cref="BaseTokenizer.AddBlankAt.True"/> is implemented
+        ///     (blank directly after pad); any other value, including the default, throws <see cref="NotImplementedException"/>.</param>
+        /// <param name="padWithSpace">Whether to pad text with spaces at the beginning and at the end or not.</param>
+        /// <remarks>
+        /// Vocabulary order: space, consonants, vowels, then (when enabled) lower-case letters, apostrophe, punctuation, then pad, blank, OOV.
+        /// </remarks>
+        public EnglishPhonemesTokenizer(
+            EnglishG2p g2p,
+            bool punct = true,
+            string[]? nonDefaultPunctList = null,
+            bool stresses = false,
+            bool chars = false,
+            string space = " ",
+            string? silence = null,
+            bool apostrophe = true,
+            string oov = BaseTokenizer.OOV,
+            string sep = "|",  // To be able to distinguish between 2/3 letters codes.
+            AddBlankAt addBlankAt = AddBlankAt.False,
+            bool padWithSpace = false)
+        // object? text_preprocessing_func=lambda text: english_text_preprocessing(text, lower=false),
+        {
+            _phonemeProbability = null;
+            _g2p = g2p;
+            _space = 0;
+            var tokens = new List<string>();
+            tokens.Add(space);
+
+            if (silence != null)
+            {
+                throw new NotImplementedException();
+            }
+
+            tokens.AddRange(Consonants);
+            var vowels = Vowels;
+
+            if (stresses)
+            {
+                vowels = vowels.SelectMany(p => Enumerable.Range(0, 3), (p, s) => $"{p}{s}").ToArray();
+            }
+            tokens.AddRange(vowels);
+
+            if (chars || _phonemeProbability != null)
+            {
+                if (!chars)
+                {
+                    // logging.warning(
+                    //     "phoneme_probability was not None, characters will be enabled even though "
+                    //     "chars was set to False."
+                    // );
+                }
+                tokens.AddRange(AsciiLowercase.Select(ch => ch.ToString()));
+            }
+
+            if (apostrophe)
+            {
+                tokens.Add("'");  // Apostrophe
+            }
+
+            if (punct)
+            {
+                if (nonDefaultPunctList != null)
+                {
+                    tokens.AddRange(nonDefaultPunctList);
+                }
+                else
+                {
+                    tokens.AddRange(PunctList);
+                }
+            }
+
+            _pad = tokens.Count;  // Record each id BEFORE adding: Count is the index the next token will receive.
+            tokens.Add(Pad);
+            if (addBlankAt != AddBlankAt.True)
+            {
+                throw new NotImplementedException();
+            }
+            _blank = tokens.Count;
+            tokens.Add(Blank);
+            _oov = tokens.Count;
+            tokens.Add(oov);  // Out Of Vocabulary
+
+            _sep = sep;
+            _padWithSpace = padWithSpace;
+
+            _id2token = tokens.ToArray();
+            _token2id = new Dictionary<string, int>(
+                Enumerable.Range(0, _id2token.Length)
+                .Select(i => new KeyValuePair<string, int>(_id2token[i], i)));
+            _utilIds = new HashSet<int>() { _pad, _blank, _oov };
+
+            _stresses = stresses;
+            _punct = punct;
+        }
+
+        public override int[] Encode(string text)
+        {
+            text = TokenizerUtils.EnglishTextPreprocessing(text);
+            var g2pText = _g2p.Parse(text);
+            return EncodeFromG2p(g2pText);
+        }
+
+        /// <summary>
+        /// Encodes text that has already been run through G2P.
+        /// Tokens missing from the vocabulary are dropped, leading/repeated/trailing spaces are removed, and stress digits are cut when stresses are off.
+        /// </summary>
+        /// <param name="g2pText">G2P's output, could be a mixture of phonemes and graphemes,
+        ///        e.g. "see OOV" -> ['S', 'IY1', ' ', 'O', 'O', 'V']</param>
+        /// <returns>Token ids, with the space id added at both ends when padding with space is enabled.</returns>
+        public int[] EncodeFromG2p(string[] g2pText)
+        {
+            var ps = new List<string>();
+            var space = _id2token[_space];
+            foreach (var _p in g2pText)
+            {
+                string p = _p;
+                // Remove stress
+                if (p.Length == 3 && !_stresses)
+                {
+                    p = p.Substring(0, 2);
+                }
+
+                // Add space if last one isn't one
+                if (p == space && ps.Count > 0 && ps[ps.Count - 1] != space)
+                {
+                    ps.Add(p);
+                }
+                // Add next phoneme or char (if chars=true)
+                else if ((char.IsLetterOrDigit(p, 0) || p == "'") && _token2id.ContainsKey(p))
+                {
+                    ps.Add(p);
+                }
+                // Add punct
+                else if (_punct && !char.IsLetterOrDigit(p, 0) && _token2id.ContainsKey(p))
+                {
+                    ps.Add(p);
+                }
+                else if (p != space)
+                {
+                    // Unknown char/phoneme: intentionally left out of the output
+                }
+            }
+
+            // Remove trailing spaces
+            while (ps.Count > 0 && ps[ps.Count - 1] == space)
+            {
+                ps.RemoveAt(ps.Count - 1);
+            }
+
+            var res = new List<int>();
+            if (_padWithSpace)
+            {
+                res.Add(_space);
+            }
+            res.AddRange(ps.Select(t => _token2id[t]));  // Map the filtered list, not the raw input, so unknown tokens cannot throw.
+            if (_padWithSpace)
+            {
+                res.Add(_space);
+            }
+            return res.ToArray();
+        }
+
+        private readonly string[] PunctList =
+        {  // Derived from LJSpeech and "/" additionally
+            ",", ".", "!", "?", "-",
+            ":", ";", "/", "\"", "(",
+            ")", "[", "]", "{", "}",
+        };
+        private readonly string[] Vowels = {
+            "AA", "AE", "AH", "AO", "AW",
+            "AY", "EH", "ER", "EY", "IH",
+            "IY", "OW", "OY", "UH", "UW",
+        };
+        private readonly string[] Consonants = {
+            "B", "CH", "D", "DH", "F", "G",
+            "HH", "JH", "K", "L", "M", "N",
+            "NG", "P", "R", "S", "SH", "T",
+            "TH", "V", "W", "Y", "Z", "ZH",
+        };
+
+        private const string AsciiLowercase = "abcdefghijklmnopqrstuvwxyz";
+
+        private readonly EnglishG2p _g2p;
+        private readonly object? _phonemeProbability;
+        private readonly bool _stresses;
+        private readonly bool _punct;
+    }
+}
diff --git a/NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs b/NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs
new file mode 100644
index 0000000..23b7796
--- /dev/null
+++ b/NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs
@@ -0,0 +1,147 @@
+﻿// Copyright (c) Katsuya Iida.  All Rights Reserved.
+// See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
+// largely located in the files found in this folder:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using System.Data.SqlTypes;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace NeMoOnnxSharp.TTSTokenizers
+{
+    /// <summary>
+    /// Text normalization and word-level tokenization helpers for English TTS input,
+    /// translated from NeMo's <c>tokenizer_utils.py</c>.
+    /// </summary>
+    public static class TokenizerUtils
+    {
+        // Maps typographic look-alike glyphs to their ASCII equivalents.
+        private static readonly Dictionary<char, char> _synoGlyph2Ascii;
+        // Matches a word (group 1), a |verbatim| span (group 2) or a run of anything else (group 3).
+        private static readonly Regex _wordsReEn;
+
+        static TokenizerUtils()
+        {
+            _synoGlyph2Ascii = new Dictionary<char, char>
+            {
+                ['\u2019'] = '\'',  // right single quotation mark
+                ['\u201D'] = '"',   // right double quotation mark
+                ['\u201C'] = '"',   // left double quotation mark
+            };
+
+            // define char set based on https://en.wikipedia.org/wiki/List_of_Unicode_characters
+            // A word starts and ends with a letter and may contain hyphens and apostrophes in between.
+            var latinAlphabetBasic = "A-Za-z";
+            _wordsReEn = new Regex(@$"([{latinAlphabetBasic}]+(?:[{latinAlphabetBasic}\-']*[{latinAlphabetBasic}]+)*)|(\|[^|]*\|)|([^{latinAlphabetBasic}|]+)");
+        }
+
+        /// <summary>
+        /// Normalizes English text: strips combining marks (after canonical decomposition), replaces
+        /// typographic quotes/apostrophes with ASCII ones and optionally lowercases the result.
+        /// </summary>
+        /// <param name="text">The text to normalize.</param>
+        /// <param name="lower">When true, the result is lowercased using invariant-culture rules.</param>
+        /// <returns>The normalized text.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
+        public static string EnglishTextPreprocessing(string text, bool lower = true)
+        {
+            if (text == null)
+            {
+                throw new ArgumentNullException(nameof(text));
+            }
+
+            // NFD splits accented letters into base letter + combining mark so the mark can be filtered out.
+            var chars = text.Normalize(NormalizationForm.FormD)
+                .Where(ch => CharUnicodeInfo.GetUnicodeCategory(ch) != UnicodeCategory.NonSpacingMark)
+                .Select(ch => _synoGlyph2Ascii.TryGetValue(ch, out var ascii) ? ascii : ch)
+                .ToArray();
+            text = new string(chars);
+
+            // Invariant casing: culture-sensitive ToLower() maps 'I' to a dotless i under
+            // e.g. Turkish cultures, which would break dictionary lookups on the lowercased words.
+            return lower ? text.ToLowerInvariant() : text;
+        }
+
+        /// <summary>
+        /// Converts regex matches into <c>(tokens, isUnchanged)</c> pairs. Every match is exactly one of:
+        /// a word (group 1), a <c>|...|</c> span that must be kept verbatim (group 2), or a run of
+        /// punctuation/whitespace/other characters (group 3). Verbatim spans lose their enclosing
+        /// bars and are split on single spaces. For example, with <paramref name="isLower"/> false,
+        /// the matches of <c>Hello World |NVIDIA unchanged|!</c> become
+        /// <code>
+        ///     (["Hello"], false),
+        ///     ([" "], false),
+        ///     (["World"], false),
+        ///     ([" "], false),
+        ///     (["NVIDIA", "unchanged"], true),
+        ///     (["!"], false)
+        /// </code>
+        /// </summary>
+        /// <param name="words">Non-overlapping matches of <c>_wordsReEn</c> (or a regex with the same three groups).</param>
+        /// <param name="isLower">When true, word tokens are lowercased (invariant culture); other tokens are untouched.</param>
+        /// <returns>One <c>(tokens, isUnchanged)</c> pair per match, in match order.</returns>
+        /// <exception cref="InvalidDataException">A match has all three groups empty.</exception>
+        private static (string[], bool)[] _wordTokenize(MatchCollection words, bool isLower = false)
+        {
+            var result = new List<(string[], bool)>();
+            foreach (Match word in words)
+            {
+                // Groups[0] is the whole match; the pattern's capture groups are numbered from 1.
+                var maybeWord = word.Groups[1].Value;
+                var maybeWithoutChanges = word.Groups[2].Value;
+                var maybePunct = word.Groups[3].Value;
+
+                var withoutChanges = false;
+                string[] token;
+                if (!string.IsNullOrEmpty(maybeWord))
+                {
+                    token = new[] { isLower ? maybeWord.ToLowerInvariant() : maybeWord };
+                }
+                else if (!string.IsNullOrEmpty(maybePunct))
+                {
+                    token = new[] { maybePunct };
+                }
+                else if (!string.IsNullOrEmpty(maybeWithoutChanges))
+                {
+                    withoutChanges = true;
+                    // Drop the enclosing '|' characters, then split the verbatim text on spaces.
+                    token = maybeWithoutChanges.Substring(1, maybeWithoutChanges.Length - 2).Split(' ');
+                }
+                else
+                {
+                    throw new InvalidDataException(
+                        $"This is not expected. Found empty string: <{word}>. " +
+                        $"Please validate your regular expression pattern '_WORDS_RE_EN' or '_WORDS_RE_ANY_LOCALE'."
+                    );
+                }
+
+                result.Add((token, withoutChanges));
+            }
+            return result.ToArray();
+        }
+
+        /// <summary>
+        /// Splits English text into word, punctuation and verbatim (<c>|...|</c>) tokens; words are lowercased.
+        /// </summary>
+        /// <param name="text">The text to tokenize, typically the output of <see cref="EnglishTextPreprocessing"/>.</param>
+        /// <returns>One <c>(tokens, isUnchanged)</c> pair per match, in text order.</returns>
+        public static (string[], bool)[] EnglishWordTokenize(string text)
+        {
+            var words = _wordsReEn.Matches(text);
+            return _wordTokenize(words, isLower: true);
+        }
+    }
+}