Skip to content

Commit

Permalink
Add English G2P and tokenizer. (#15)
Browse files Browse the repository at this point in the history
  • Loading branch information
kaiidams authored Sep 30, 2023
1 parent 28b439b commit 4ad71cb
Show file tree
Hide file tree
Showing 10 changed files with 808 additions and 0 deletions.
10 changes: 10 additions & 0 deletions NeMoOnnxSharp.Example/PretrainedModelInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@ private static PretrainedModelInfo[] CreateModelList()
"https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/commandrecognition_en_matchboxnet3x1x64_v2.onnx",
"a0c5e4d14e83d3b6afdaf239265a390c2ca513bcdedf3d295bc1f9f97f19868a"
),
new PretrainedModelInfo(
"cmudict-0.7b_nv22.10",
"https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/cmudict-0.7b_nv22.10",
"d330f3a3554d4c7ff8ef7bfc0c338ed74831d5f54109508fb829bdd82173608b"
),
new PretrainedModelInfo(
"heteronyms-052722",
"https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/heteronyms-052722",
"b701909aedf753172eff223950f8859cd4b9b4c80199cf0a6e9ac4a307c8f8ec"
),
new PretrainedModelInfo(
"tts_en_hifigan",
"https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/tts_en_hifigan.onnx",
Expand Down
22 changes: 22 additions & 0 deletions NeMoOnnxSharp.Example/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Text;
using System.Threading.Tasks;
using System.Collections.Generic;
using NeMoOnnxSharp.TTSTokenizers;

namespace NeMoOnnxSharp.Example
{
Expand All @@ -19,6 +20,10 @@ static async Task Main(string[] args)
{
await Transcribe();
}
else if (task == "speak")
{
await Speak();
}
else if (task == "vad")
{
await FramePredict(false);
Expand Down Expand Up @@ -54,6 +59,23 @@ static async Task Transcribe()
}
}

/// <summary>
/// Example task demonstrating English G2P and phoneme tokenization:
/// downloads the CMU pronunciation dictionary and the heteronym list,
/// builds the tokenizer, and encodes a short sample text.
/// </summary>
static async Task Speak()
{
    // NOTE: the downloaded files are identified by the names registered
    // in PretrainedModelInfo; DownloadModelAsync returns local paths.
    string phoneDict = await DownloadModelAsync("cmudict-0.7b_nv22.10");
    string heteronyms = await DownloadModelAsync("heteronyms-052722");
    var g2p = new EnglishG2p(phoneDict, heteronyms);
    var tokenizer = new EnglishPhonemesTokenizer(
        g2p,
        punct: true,
        stresses: true,
        chars: true,
        apostrophe: true,
        padWithSpace: true,
        addBlankAt: BaseTokenizer.AddBlankAt.True);
    // The example only exercises the pipeline; the token ids are discarded.
    tokenizer.Encode("Hello world!");
}

static async Task FramePredict(bool mbn)
{
string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
Expand Down
21 changes: 21 additions & 0 deletions NeMoOnnxSharp.Tests/Data/cmudict-test
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Comment
YOU'VE Y UW1 V
READ R EH1 D
READ(1) R IY1 D
BOOK B UH1 K
THE DH AH0
THE(1) DH AH1
THE(2) DH IY0
OPERATING AA1 P ER0 EY2 T IH0 NG
OPERATING(1) AO1 P ER0 EY2 T IH0 NG
SYSTEM S IH1 S T AH0 M
DESIGN D IH0 Z AY1 N
AND AH0 N D
AND(1) AE1 N D
IMPLEMENTATION IH2 M P L AH0 M EH0 N T EY1 SH AH0 N
THIRD TH ER1 D
EDITION AH0 D IH1 SH AH0 N
EDITION(1) IH0 D IH1 SH AH0 N
DID D IH1 D
DID(1) D IH0 D
YOU Y UW1
1 change: 1 addition & 0 deletions NeMoOnnxSharp.Tests/Data/heteronyms-test
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
read
6 changes: 6 additions & 0 deletions NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@
</ItemGroup>

<ItemGroup>
<None Update="Data\cmudict-test">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Data\heteronyms-test">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Data\make_test.py">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</None>
Expand Down
89 changes: 89 additions & 0 deletions NeMoOnnxSharp.Tests/TextTokenizersTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
using Microsoft.VisualStudio.TestTools.UnitTesting;
using NeMoOnnxSharp.TTSTokenizers;
using System;
using System.Diagnostics;
using System.IO;

namespace NeMoOnnxSharp.Tests
{
    /// <summary>
    /// Tests for <see cref="EnglishG2p"/> and <see cref="EnglishPhonemesTokenizer"/>
    /// using the small dictionary fixtures under Data\.
    /// </summary>
    [TestClass]
    public class TextTokenizersTest
    {
        // Expected vocabulary: space, consonant phonemes, stressed vowel
        // phonemes, lowercase characters, punctuation, and special tokens.
        private readonly static string[] ExpectedTokens =
        {
            " ", "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M",
            "N", "NG", "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH",
            "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0",
            "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "EH0", "EH1",
            "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "IH0", "IH1", "IH2",
            "IY0", "IY1", "IY2", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "UH0",
            "UH1", "UH2", "UW0", "UW1", "UW2", "a", "b", "c", "d", "e", "f", "g",
            "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
            "v", "w", "x", "y", "z", "'", ",", ".", "!", "?", "-", ":", ";", "/",
            "\"", "(", ")", "[", "]", "{", "}", "<pad>", "<blank>", "<oov>"
        };

        private const string SampleText =
            "You've read the book “Operating Systems Design and Implementation, 3rd edition”. Did you?";
        private const string NormalizedSampleText =
            "You've read the book “Operating Systems Design and Implementation, third edition”. Did you?";
        // Expected pronunciation: dictionary words become ARPABET phonemes,
        // out-of-dictionary words (and the heteronym "read") stay as characters.
        private const string SamplePronText =
            "Y|UW1|V| |r|e|a|d| |t|h|e| |B|UH1|K| |“|o|p|e|r|a|t|i|n|g| |"
            + "S|IH1|S|T|AH0|M|Z| |D|IH0|Z|AY1|N| |a|n|d| |IH2|M|P|L|AH0|"
            + "M|EH0|N|T|EY1|SH|AH0|N|,| |TH|ER1|D| |e|d|i|t|i|o|n|”|.| |"
            + "d|i|d| |Y|UW1|?";

        private readonly static int[] SampleParsed =
        {
            0, 22, 68, 20, 0, 87, 74, 70, 73, 0, 89, 77, 74,
            0, 1, 65, 9, 0, 105, 84, 85, 74, 87, 70, 89, 78,
            83, 76, 0, 16, 53, 16, 18, 31, 11, 23, 0, 3, 52,
            23, 41, 12, 0, 70, 83, 73, 0, 54, 11, 14, 10, 31,
            11, 43, 12, 18, 50, 17, 31, 12, 97, 0, 19, 47, 3,
            0, 74, 73, 78, 89, 78, 84, 83, 105, 98, 0, 73, 78,
            73, 0, 22, 68, 100, 0
        };

        /// <summary>
        /// Builds the G2P converter and tokenizer from the test fixture
        /// dictionaries. Runs before every test method, so the fields below
        /// are always non-null inside the tests.
        /// </summary>
        [TestInitialize]
        public void Initialize()
        {
            string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
            _g2p = new EnglishG2p(
                phonemeDict: Path.Combine(appDirPath, "Data", "cmudict-test"),
                heteronyms: Path.Combine(appDirPath, "Data", "heteronyms-test"),
                phonemeProbability: 1.0);
            _tokenizer = new EnglishPhonemesTokenizer(
                _g2p,
                punct: true,
                stresses: true,
                chars: true,
                apostrophe: true,
                padWithSpace: true,
                addBlankAt: BaseTokenizer.AddBlankAt.True);
        }

        [TestMethod]
        public void TestTokenizerVocab()
        {
            // Fields are assigned in Initialize; '!' silences CS8602
            // (dereference of a possibly null reference) reported by CI.
            CollectionAssert.AreEquivalent(ExpectedTokens, _tokenizer!.Tokens);
        }

        [TestMethod]
        public void TestEnglishG2p()
        {
            var pron = string.Join("|", _g2p!.Parse(NormalizedSampleText));
            Assert.AreEqual(SamplePronText, pron);
        }

        [TestMethod]
        public void TestEnglishEncode()
        {
            var parsed = _tokenizer!.Encode(NormalizedSampleText);
            CollectionAssert.AreEquivalent(SampleParsed, parsed);
        }

        // Nullable because construction happens in Initialize, not a constructor.
        private EnglishG2p? _g2p;
        private EnglishPhonemesTokenizer? _tokenizer;
    }
}
75 changes: 75 additions & 0 deletions NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
// largely located in the files found in this folder:
//
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace NeMoOnnxSharp.TTSTokenizers
{
    /// <summary>
    /// Common base class for TTS tokenizers. Holds the token/id vocabulary
    /// and the ids of utility tokens (pad, blank, oov) that are stripped
    /// when decoding. Subclasses implement <see cref="Encode"/>.
    /// </summary>
    public abstract class BaseTokenizer
    {
        /// <summary>Where (if anywhere) to insert the blank token.</summary>
        public enum AddBlankAt
        {
            False,
            True,
            Last
        }

        protected const string Pad = "<pad>";
        protected const string Blank = "<blank>";
        protected const string OOV = "<oov>";

        /// <summary>
        /// Starts with an empty vocabulary; subclasses populate the
        /// protected fields during their own construction.
        /// </summary>
        protected BaseTokenizer()
        {
            _sep = string.Empty;
            _id2token = Array.Empty<string>();
            _token2id = new Dictionary<string, int>();
            _utilIds = new HashSet<int>();
        }

        /// <summary>
        /// Turns str text into int tokens.
        /// </summary>
        public abstract int[] Encode(string text);

        /// <summary>
        /// Turns ints tokens into str text. Utility tokens (pad, blank,
        /// oov) are dropped; the remaining tokens are joined with
        /// <see cref="Sep"/>.
        /// </summary>
        public string Decode(int[] tokens)
        {
            var pieces = new List<string>();
            foreach (var id in tokens)
            {
                if (_utilIds.Contains(id))
                {
                    continue;
                }
                pieces.Add(_id2token[id]);
            }
            return string.Join(_sep, pieces);
        }

        public string[] Tokens => _id2token;
        public int PadId => _pad;
        public int BlankId => _blank;
        public int OOVId => _oov;
        public string Sep => _sep;

        protected string[] _id2token;
        protected IDictionary<string, int> _token2id;
        protected ISet<int> _utilIds;
        protected int _space;
        protected int _pad;
        protected int _blank;
        protected int _oov;
        protected string _sep;
        protected bool _padWithSpace;
    }
}
Loading

0 comments on commit 4ad71cb

Please sign in to comment.