Use CLAP to achieve prompt-controlled generation (#223)
* Quickly classify audio and save the results as yml in the training root directory (#190)

* Add files via upload

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* Update models.py

* Update webui.py

* Update infer.py

* Create compress_model.py

* Resubmit: update the Gradio inference UI (#193)

* Update webui.py

* Update webui.py

* Update train_ms.py

* Update models.py

* Update models.py

* Update models.py

* Update train_ms.py

* Update train_ms.py

* Update models.py

* Update preprocess_text.py

* Update config.json

* Update train_ms.py

* Update webui.py (#206)

* Add files via upload (#209)

* Update train_ms.py

* Update train_ms.py

* Update preprocess_text.py

* Update train_ms.py

* fix (#211)

* Update emotion_clustering.py

* Add files via upload

* Update emotion_clustering.py

* add cluster center save

* Add files via upload

* Update config.py

* Update default_config.yml

* Update config.py

* Update config.py

* Update emotion_clustering.py

* Update emotion_clustering.py

* Update config.py

* Update emotion_clustering.py

* Update emotion_clustering.py

* Update webui.py

* Update emotion_clustering.py

* Update commons.py

* Update emotion_clustering.py

* Update webui.py

* Update webui.py

* Add files via upload

* Update train_ms.py

* Update train_ms.py

* Update train_ms.py

* Update train_ms.py

* Update train_ms.py

* Update webui.py

* Update emotion_clustering.py

* Update emotion_clustering.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix default_config.yml.

* Update infer.py

* feat: support infer 2.1 models

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix: support infer 2.1 models (compatibility bug fix)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update train_ms.py

* Add CLAP

* Fix data loader

* Fix infer.py

* Fix webui.py

* Add prompt template

* Update clap_gen.py

* Fix wrong environ value

* Add g for dur disc

* Update clap_gen.py

* Fix multilang generation

* Update config.json

* Prompt mode

* Improve slice segments performance

* Add preprocess webui

* Update webui_preprocess.py

* Update webui_preprocess.py

* Update config.py

* Update default_config.yml

* Update config.py

* Update clap_gen.py

* Delete emo_gen.py

* Delete get_emo.py

* Delete emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim directory

* Update README.md

* Update README

* Split val per lang

* Delete emotion_clustering.py

* Update default_config.yml

* Update default_config.yml

* Update config.py

* Update preprocess_text.py

* Update webui_preprocess.py

* Update default_config.yml

* Update webui_preprocess.py

* Update preprocess_text.py

* Random augmentation for CLAP

* Update data_utils.py

* Update preprocess_text.py

* Add vq for CLAP features to avoid overfitting

* Random dummy inputs

* Update webui.py

* Update models.py

* Update infer.py

* Apply Code Formatter Change

* Update config.json

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: YYuX-1145 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Sora <[email protected]>
Co-authored-by: Sihan Wang <[email protected]>
Co-authored-by: Stardust-minus <[email protected]>
6 people authored Dec 12, 2023
1 parent 9cc786d commit eaefc57
Showing 49 changed files with 287,404 additions and 3,426 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -5,6 +5,11 @@
# Bert-VITS2

VITS2 Backbone with multilingual bert

For a quick guide, please refer to `webui_preprocess.py`.

## Please note that the core idea of this project comes from [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS), an excellent TTS project
## A demo of MassTTS: [AI Fengge reviews Fengge himself and recovers the kidney lost in the Golden Triangle](https://www.bilibili.com/video/BV1w24y1c7z9)

64 changes: 64 additions & 0 deletions clap_gen.py
@@ -0,0 +1,64 @@
import argparse
from multiprocessing import Pool, cpu_count

import torch
import torch.multiprocessing as mp
from tqdm import tqdm

import utils
from config import config
from clap_wrapper import get_clap_audio_feature
import librosa
import os

# Limit BLAS threading so the worker processes do not oversubscribe the CPU.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"


def process_line(line):
    device = config.emo_gen_config.device
    if config.emo_gen_config.use_multi_device:
        # Spread worker processes across the available GPUs round-robin.
        rank = mp.current_process()._identity
        rank = rank[0] if len(rank) > 0 else 0
        if torch.cuda.is_available():
            gpu_id = rank % torch.cuda.device_count()
            device = torch.device(f"cuda:{gpu_id}")
        else:
            device = torch.device("cpu")
    wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")

    clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy")
    if os.path.isfile(clap_path):
        return

    # CLAP expects 48 kHz audio; pass the sampling rate as a keyword argument.
    audio = librosa.load(wav_path, sr=48000)[0]
    # audio = librosa.resample(audio, 44100, 48000)

    clap = get_clap_audio_feature(audio, device)
    torch.save(clap, clap_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--config", type=str, default=config.emo_gen_config.config_path
    )
    parser.add_argument(
        "--num_processes", type=int, default=config.emo_gen_config.num_processes
    )
    args, _ = parser.parse_known_args()
    config_path = args.config
    hps = utils.get_hparams_from_file(config_path)
    lines = []
    with open(hps.data.training_files, encoding="utf-8") as f:
        lines.extend(f.readlines())

    with open(hps.data.validation_files, encoding="utf-8") as f:
        lines.extend(f.readlines())
    if len(lines) != 0:
        num_processes = min(args.num_processes, cpu_count())
        with Pool(processes=num_processes) as pool:
            for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
                pass

    print(f"CLAP feature generation complete! {len(lines)} .emo.npy files in total.")
49 changes: 49 additions & 0 deletions clap_wrapper.py
@@ -0,0 +1,49 @@
import sys

import torch
from transformers import ClapModel, ClapProcessor

from config import config

# One model instance per device, created lazily on first use.
models = dict()
processor = ClapProcessor.from_pretrained("./emotional/clap-htsat-fused")


def get_clap_audio_feature(audio_data, device=config.bert_gen_config.device):
    # Prefer Apple's MPS backend when running on macOS without CUDA.
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        device = "cuda"
    if device not in models.keys():
        models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
            device
        )
    with torch.no_grad():
        inputs = processor(
            audios=audio_data, return_tensors="pt", sampling_rate=48000
        ).to(device)
        emb = models[device].get_audio_features(**inputs)
    return emb.T


def get_clap_text_feature(text, device=config.bert_gen_config.device):
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        device = "cuda"
    if device not in models.keys():
        models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
            device
        )
    with torch.no_grad():
        inputs = processor(text=text, return_tensors="pt").to(device)
        emb = models[device].get_text_features(**inputs)
    return emb.T
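
Because ClapModel projects audio and text into a shared embedding space, a free-form text prompt can stand in for an audio emotion reference at inference time, which is what enables prompt-controlled generation. A minimal usage sketch, assuming CUDA is available and the model files are present under ./emotional/clap-htsat-fused (the prompt string is an arbitrary example):

from clap_wrapper import get_clap_text_feature

# Arbitrary example prompt; any descriptive phrase works.
text_emo = get_clap_text_feature("a happy, excited voice", device="cuda")
# Same (feature_dim, 1) shape as an audio feature from
# get_clap_audio_feature, so downstream code can consume either.
print(text_emo.shape)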
20 changes: 6 additions & 14 deletions commons.py
@@ -46,26 +46,18 @@ def rand_gumbel_like(x):


 def slice_segments(x, ids_str, segment_size=4):
-    ret = torch.zeros_like(x[:, :, :segment_size])
-    for i in range(x.size(0)):
-        idx_str = ids_str[i]
-        idx_end = idx_str + segment_size
-        if idx_str < 0:
-            i1 = x.size(2) + idx_str
-            r1 = x[i, :, i1:]
-            r2 = x[i, :, :idx_end]
-            ret[i] = torch.cat([r1, r2], dim=1)
-        else:
-            ret[i] = x[i, :, idx_str:idx_end]
-    return ret
+    gather_indices = ids_str.view(x.size(0), 1, 1).repeat(
+        1, x.size(1), 1
+    ) + torch.arange(segment_size, device=x.device)
+    return torch.gather(x, 2, gather_indices)
 
 
 def rand_slice_segments(x, x_lengths=None, segment_size=4):
     b, d, t = x.size()
     if x_lengths is None:
         x_lengths = t
-    ids_str_max = x_lengths - segment_size + 1
-    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+    ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
+    ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
     ret = slice_segments(x, ids_str, segment_size)
     return ret, ids_str
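
The gather-based rewrite above ("Improve slice segments performance") removes the per-sample Python loop. Note that it also drops the old wrap-around handling for negative start indices, which rand_slice_segments never produces. A minimal sketch checking that both versions agree on valid indices; the function names are illustrative, not part of the repository:

import torch


def slice_segments_loop(x, ids_str, segment_size=4):
    # Old approach: slice each batch element in a Python loop.
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        ret[i] = x[i, :, ids_str[i] : ids_str[i] + segment_size]
    return ret


def slice_segments_gather(x, ids_str, segment_size=4):
    # New approach: build (B, D, segment_size) indices and gather once.
    gather_indices = ids_str.view(x.size(0), 1, 1).repeat(
        1, x.size(1), 1
    ) + torch.arange(segment_size, device=x.device)
    return torch.gather(x, 2, gather_indices)


x = torch.randn(8, 192, 100)
ids = torch.randint(0, 100 - 4 + 1, (8,))
assert torch.equal(slice_segments_loop(x, ids), slice_segments_gather(x, ids))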

1 change: 1 addition & 0 deletions compress_model.py
@@ -1,6 +1,7 @@
from collections import OrderedDict
from text.symbols import symbols
import torch

from tools.log import logger
import utils
from models import SynthesizerTrn