From f8a57fb61fec684612d5477705280b644c48a291 Mon Sep 17 00:00:00 2001 From: spicysama Date: Fri, 25 Oct 2024 15:11:51 +0800 Subject: [PATCH] Update docs (#638) * Linux pyaudio dependencies * revert generate.py * Better bug report & feat request * Auto-select torchaudio backend * safety * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * feat: manual seed for restore * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Gradio > 5 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix docs and code * Update help docs * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/en/finetune.md | 2 +- docs/en/inference.md | 7 ++++- docs/ja/finetune.md | 2 +- docs/ja/inference.md | 54 +++------------------------------------ docs/pt/finetune.md | 2 +- docs/pt/inference.md | 50 +++--------------------------------- docs/zh/finetune.md | 2 +- docs/zh/inference.md | 9 ++++--- tools/api.py | 5 ++-- tools/commons.py | 1 - tools/post_api.py | 44 ++++++++++++++++++++----------- tools/vqgan/extract_vq.py | 5 ++-- 12 files changed, 57 insertions(+), 126 deletions(-) diff --git a/docs/en/finetune.md b/docs/en/finetune.md index a81727aa..8b19a8df 100644 --- a/docs/en/finetune.md +++ b/docs/en/finetune.md @@ -109,7 +109,7 @@ python fish_speech/train.py --config-name text2semantic_finetune \ !!! note For Windows users, you can use `trainer.strategy.process_group_backend=gloo` to avoid `nccl` issues. -After training is complete, you can refer to the [inference](inference.md) section, and use `--speaker SPK1` to generate speech. +After training is complete, you can refer to the [inference](inference.md) section to generate speech. !!! info By default, the model will only learn the speaker's speech patterns and not the timbre. You still need to use prompts to ensure timbre stability. diff --git a/docs/en/inference.md b/docs/en/inference.md index fa91f7bb..316c8a9c 100644 --- a/docs/en/inference.md +++ b/docs/en/inference.md @@ -74,7 +74,7 @@ python -m tools.api \ --decoder-config-name firefly_gan_vq ``` -If you want to speed up inference, you can add the --compile parameter. +> If you want to speed up inference, you can add the `--compile` parameter. After that, you can view and test the API at http://127.0.0.1:8080/. @@ -107,6 +107,10 @@ The above command synthesizes the desired `MP3` format audio based on the inform You can also use `--reference_id` (only one can be used) instead of `--reference-audio` and `--reference_text`, provided that you create a `references/` folder in the project root directory, which contains any audio and annotation text. The currently supported reference audio has a maximum total duration of 90 seconds. + +!!! info + To learn more about available parameters, you can use the command `python -m tools.post_api -h` + ## GUI Inference [Download client](https://github.com/AnyaCoder/fish-speech-gui/releases) @@ -120,6 +124,7 @@ python -m tools.webui \ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \ --decoder-config-name firefly_gan_vq ``` +> If you want to speed up inference, you can add the `--compile` parameter. !!! 
note You can save the label file and reference audio file in advance to the `references` folder in the main directory (which you need to create yourself), so that you can directly call them in the WebUI. diff --git a/docs/ja/finetune.md b/docs/ja/finetune.md index 05586b90..68db8cbd 100644 --- a/docs/ja/finetune.md +++ b/docs/ja/finetune.md @@ -109,7 +109,7 @@ python fish_speech/train.py --config-name text2semantic_finetune \ !!! note Windowsユーザーの場合、`trainer.strategy.process_group_backend=gloo` を使用して `nccl` の問題を回避できます。 -トレーニングが完了したら、[推論](inference.md)セクションを参照し、`--speaker SPK1` を使用して音声を生成します。 +トレーニングが完了したら、[推論](inference.md)セクションを参照し、音声を生成します。 !!! info デフォルトでは、モデルは話者の発話パターンのみを学習し、音色は学習しません。音色の安定性を確保するためにプロンプトを使用する必要があります。 diff --git a/docs/ja/inference.md b/docs/ja/inference.md index 4c63c9d7..c4e61450 100644 --- a/docs/ja/inference.md +++ b/docs/ja/inference.md @@ -74,7 +74,7 @@ python -m tools.api \ --decoder-config-name firefly_gan_vq ``` -推論を高速化したい場合は、--compile パラメータを追加できます。 +> 推論を高速化したい場合は、`--compile` パラメータを追加できます。 その後、`http://127.0.0.1:8080/`で API を表示およびテストできます。 @@ -90,55 +90,8 @@ python -m tools.post_api \ 上記のコマンドは、参照音声の情報に基づいて必要な音声を合成し、ストリーミング方式で返すことを示しています。 -`{SPEAKER}`と`{EMOTION}`に基づいて参照音声をランダムに選択する必要がある場合は、以下の手順に従って設定します: - -### 1. プロジェクトのルートディレクトリに`ref_data`フォルダを作成します。 - -### 2. `ref_data`フォルダ内に次のような構造のディレクトリを作成します。 - -``` -. -├── SPEAKER1 -│ ├──EMOTION1 -│ │ ├── 21.15-26.44.lab -│ │ ├── 21.15-26.44.wav -│ │ ├── 27.51-29.98.lab -│ │ ├── 27.51-29.98.wav -│ │ ├── 30.1-32.71.lab -│ │ └── 30.1-32.71.flac -│ └──EMOTION2 -│ ├── 30.1-32.71.lab -│ └── 30.1-32.71.mp3 -└── SPEAKER2 - └─── EMOTION3 - ├── 30.1-32.71.lab - └── 30.1-32.71.mp3 - -``` - -つまり、まず`ref_data`に`{SPEAKER}`フォルダを配置し、各スピーカーの下に`{EMOTION}`フォルダを配置し、各感情フォルダの下に任意の数の音声-テキストペアを配置します - -### 3. 仮想環境で以下のコマンドを入力します. - -```bash -python tools/gen_ref.py - -``` - -参照ディレクトリを生成します。 - -### 4. API を呼び出します。 - -```bash -python -m tools.post_api \ - --text "入力するテキスト" \ - --speaker "${SPEAKER1}" \ - --emotion "${EMOTION1}" \ - --streaming True - -``` - -上記の例はテスト目的のみです。 +!!! info + 使用可能なパラメータの詳細については、コマンド` python -m tools.post_api -h `を使用してください ## WebUI 推論 @@ -150,6 +103,7 @@ python -m tools.webui \ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \ --decoder-config-name firefly_gan_vq ``` +> 推論を高速化したい場合は、`--compile` パラメータを追加できます。 !!! note ラベルファイルと参照音声ファイルをメインディレクトリの `references` フォルダ(自分で作成する必要があります)に事前に保存しておくことで、WebUI で直接呼び出すことができます。 diff --git a/docs/pt/finetune.md b/docs/pt/finetune.md index a0c0b005..f57d92c7 100644 --- a/docs/pt/finetune.md +++ b/docs/pt/finetune.md @@ -109,7 +109,7 @@ python fish_speech/train.py --config-name text2semantic_finetune \ !!! note Para usuários do Windows, é recomendado usar `trainer.strategy.process_group_backend=gloo` para evitar problemas com `nccl`. -Após concluir o treinamento, consulte a seção [inferência](inference.md), e use `--speaker SPK1` para gerar fala. +Após concluir o treinamento, consulte a seção [inferência](inference.md). !!! info Por padrão, o modelo aprenderá apenas os padrões de fala do orador e não o timbre. Ainda pode ser preciso usar prompts para garantir a estabilidade do timbre. diff --git a/docs/pt/inference.md b/docs/pt/inference.md index e4c3ced5..8cbaa4ee 100644 --- a/docs/pt/inference.md +++ b/docs/pt/inference.md @@ -74,7 +74,7 @@ python -m tools.api \ --decoder-config-name firefly_gan_vq ``` -Para acelerar a inferência, adicione o parâmetro `--compile`. +> Para acelerar a inferência, adicione o parâmetro `--compile`. 
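(As an aside to the `--compile` hint above: a minimal, illustrative sketch of the kind of speedup such a flag typically enables, assuming it maps to `torch.compile` applied to the hot autoregressive step. `decode_one_token`, its signature, and the toy model below are hypothetical stand-ins, not the project's actual API.)

```python
import torch


def decode_one_token(model: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
    # Hypothetical hot path: one autoregressive step of the text2semantic model.
    return model(x)


# A --compile style flag usually just swaps the hot path for a compiled version:
# the first call pays a one-off compilation cost, later calls reuse the optimized graph.
decode_one_token = torch.compile(decode_one_token)

if __name__ == "__main__":
    toy_model = torch.nn.Linear(16, 16)  # stand-in for the real model
    out = decode_one_token(toy_model, torch.randn(1, 16))
    print(out.shape)
```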
Depois disso, é possível visualizar e testar a API em http://127.0.0.1:8080/. @@ -90,51 +90,8 @@ python -m tools.post_api \ O comando acima indica a síntese do áudio desejada de acordo com as informações do áudio de referência e a retorna em modo de streaming. -Caso selecione, de forma aleatória, o áudio de referência com base em `{SPEAKER}` e `{EMOTION}`, o configure de acordo com as seguintes etapas: - -### 1. Crie uma pasta `ref_data` no diretório raiz do projeto. - -### 2. Crie uma estrutura de diretórios semelhante à seguinte dentro da pasta `ref_data`. - -``` -. -├── SPEAKER1 -│ ├──EMOTION1 -│ │ ├── 21.15-26.44.lab -│ │ ├── 21.15-26.44.wav -│ │ ├── 27.51-29.98.lab -│ │ ├── 27.51-29.98.wav -│ │ ├── 30.1-32.71.lab -│ │ └── 30.1-32.71.flac -│ └──EMOTION2 -│ ├── 30.1-32.71.lab -│ └── 30.1-32.71.mp3 -└── SPEAKER2 - └─── EMOTION3 - ├── 30.1-32.71.lab - └── 30.1-32.71.mp3 -``` - -Ou seja, primeiro coloque as pastas `{SPEAKER}` em `ref_data`, depois coloque as pastas `{EMOTION}` em cada pasta de orador (speaker) e coloque qualquer número de `pares áudio-texto` em cada pasta de emoção. - -### 3. Digite o seguinte comando no ambiente virtual - -```bash -python tools/gen_ref.py - -``` - -### 4. Chame a API. - -```bash -python -m tools.post_api \ - --text "Texto a ser inserido" \ - --speaker "${SPEAKER1}" \ - --emotion "${EMOTION1}" \ - --streaming True -``` - -O exemplo acima é apenas para fins de teste. +!!! info + Para aprender mais sobre parâmetros disponíveis, você pode usar o comando `python -m tools.post_api -h` ## Inferência por WebUI @@ -146,6 +103,7 @@ python -m tools.webui \ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \ --decoder-config-name firefly_gan_vq ``` +> Para acelerar a inferência, adicione o parâmetro `--compile`. !!! note Você pode salvar antecipadamente o arquivo de rótulos e o arquivo de áudio de referência na pasta `references` do diretório principal (que você precisa criar), para que possa chamá-los diretamente na WebUI. diff --git a/docs/zh/finetune.md b/docs/zh/finetune.md index a6bcbf85..f7db80c9 100644 --- a/docs/zh/finetune.md +++ b/docs/zh/finetune.md @@ -119,7 +119,7 @@ python fish_speech/train.py --config-name text2semantic_finetune \ !!! note 对于 Windows 用户, 你可以使用 `trainer.strategy.process_group_backend=gloo` 来避免 `nccl` 的问题. -训练结束后, 你可以参考 [推理](inference.md) 部分, 并携带 `--speaker SPK1` 参数来测试你的模型. +训练结束后, 你可以参考 [推理](inference.md) 部分来测试你的模型. !!! info 默认配置下, 基本只会学到说话人的发音方式, 而不包含音色, 你依然需要使用 prompt 来保证音色的稳定性. diff --git a/docs/zh/inference.md b/docs/zh/inference.md index a1434f00..0c679be0 100644 --- a/docs/zh/inference.md +++ b/docs/zh/inference.md @@ -79,7 +79,7 @@ python -m tools.api \ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \ --decoder-config-name firefly_gan_vq ``` -如果你想要加速推理,可以加上`--compile`参数。 +> 如果你想要加速推理,可以加上`--compile`参数。 推荐中国大陆用户运行以下命令来启动 HTTP 服务: ```bash @@ -100,8 +100,7 @@ python -m tools.post_api \ 上面的命令表示按照参考音频的信息,合成所需的音频并流式返回. -下面的示例展示了, 可以一次使用**多个** `参考音频路径` 和 `参考音频的文本内容`。在命令里用空格隔开即可。 - +下面的示例展示了, 可以一次使用**多个** `参考音频路径` 和 `参考音频的文本内容`。在命令里用空格隔开即可。 ```bash python -m tools.post_api \ --text "要输入的文本" \ @@ -117,6 +116,9 @@ python -m tools.post_api \ 还可以用`--reference_id`(仅能用一个)来代替`--reference_audio`和`--reference_text`, 前提是在项目根目录下创建`references/`文件夹, 里面放上任意对音频与标注文本。 目前支持的参考音频最多加起来总时长90s。 +!!! 
info + 要了解有关可用参数的更多信息,可以使用命令`python -m tools.post_api -h` + ## GUI 推理 [下载客户端](https://github.com/AnyaCoder/fish-speech-gui/releases) @@ -130,6 +132,7 @@ python -m tools.webui \ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \ --decoder-config-name firefly_gan_vq ``` +> 如果你想要加速推理,可以加上`--compile`参数。 !!! note 你可以提前将label文件和参考音频文件保存到主目录下的 `references` 文件夹(需要自行创建),这样你可以直接在WebUI中调用它们。 diff --git a/tools/api.py b/tools/api.py index 1d53d269..2c7f1698 100644 --- a/tools/api.py +++ b/tools/api.py @@ -47,9 +47,8 @@ from tools.vqgan.inference import load_model as load_decoder_model backends = torchaudio.list_audio_backends() -if "sox" in backends: - backend = "sox" -elif "ffmpeg" in backends: + +if "ffmpeg" in backends: backend = "ffmpeg" else: backend = "soundfile" diff --git a/tools/commons.py b/tools/commons.py index de70a2c6..2e8ce5a4 100644 --- a/tools/commons.py +++ b/tools/commons.py @@ -30,7 +30,6 @@ class ServeTTSRequest(BaseModel): latency: Literal["normal", "balanced"] = "normal" # not usually used below streaming: bool = False - emotion: Optional[str] = None max_new_tokens: int = 1024 top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7 repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2 diff --git a/tools/post_api.py b/tools/post_api.py index c533d9aa..92fb0c25 100644 --- a/tools/post_api.py +++ b/tools/post_api.py @@ -15,7 +15,8 @@ def parse_args(): parser = argparse.ArgumentParser( - description="Send a WAV file and text to a server and receive synthesized audio." + description="Send a WAV file and text to a server and receive synthesized audio.", + formatter_class=argparse.RawTextHelpFormatter, ) parser.add_argument( @@ -33,7 +34,7 @@ def parse_args(): "-id", type=str, default=None, - help="ID of the reference model to be used for the speech", + help="ID of the reference model to be used for the speech\n(Local: name of folder containing audios and files)", ) parser.add_argument( "--reference_audio", @@ -41,7 +42,7 @@ def parse_args(): type=str, nargs="+", default=None, - help="Path to the WAV file", + help="Path to the audio file", ) parser.add_argument( "--reference_text", @@ -68,17 +69,25 @@ def parse_args(): parser.add_argument( "--format", type=str, choices=["wav", "mp3", "flac"], default="wav" ) - parser.add_argument("--mp3_bitrate", type=int, default=64) + parser.add_argument( + "--mp3_bitrate", type=int, choices=[64, 128, 192], default=64, help="kHz" + ) parser.add_argument("--opus_bitrate", type=int, default=-1000) - parser.add_argument("--latency", type=str, default="normal", help="延迟选项") + parser.add_argument( + "--latency", + type=str, + default="normal", + choices=["normal", "balanced"], + help="Used in api.fish.audio/v1/tts", + ) parser.add_argument( "--max_new_tokens", type=int, - default=1024, - help="Maximum new tokens to generate", + default=0, + help="Maximum new tokens to generate. 
\n0 means no limit.", ) parser.add_argument( - "--chunk_length", type=int, default=100, help="Chunk length for synthesis" + "--chunk_length", type=int, default=200, help="Chunk length for synthesis" ) parser.add_argument( "--top_p", type=float, default=0.7, help="Top-p sampling for synthesis" @@ -92,10 +101,7 @@ def parse_args(): parser.add_argument( "--temperature", type=float, default=0.7, help="Temperature for sampling" ) - parser.add_argument( - "--speaker", type=str, default=None, help="Speaker ID for voice synthesis" - ) - parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion") + parser.add_argument( "--streaming", type=bool, default=False, help="Enable streaming response" ) @@ -107,7 +113,17 @@ def parse_args(): "--use_memory_cache", type=str, default="never", - help="Cache encoded references codes in memory", + choices=["on-demand", "never"], + help="Cache encoded references codes in memory.\n" + "If `on-demand`, the server will use cached encodings\n " + "instead of encoding reference audio again.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="`None` means randomized inference, otherwise deterministic.\n" + "It can't be used for fixing a timbre.", ) parser.add_argument( "--seed", @@ -157,8 +173,6 @@ def parse_args(): "top_p": args.top_p, "repetition_penalty": args.repetition_penalty, "temperature": args.temperature, - "speaker": args.speaker, - "emotion": args.emotion, "streaming": args.streaming, "use_memory_cache": args.use_memory_cache, "seed": args.seed, diff --git a/tools/vqgan/extract_vq.py b/tools/vqgan/extract_vq.py index 1983219b..11e8e143 100644 --- a/tools/vqgan/extract_vq.py +++ b/tools/vqgan/extract_vq.py @@ -25,9 +25,8 @@ # It's mainly used to generate the training data for the VQ model. backends = torchaudio.list_audio_backends() -if "sox" in backends: - backend = "sox" -elif "ffmpeg" in backends: + +if "ffmpeg" in backends: backend = "ffmpeg" else: backend = "soundfile"
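The backend hunks in `tools/api.py` and `tools/vqgan/extract_vq.py` above drop the `sox` branch and prefer `ffmpeg`, falling back to `soundfile`. Below is a minimal sketch of how the selected backend string is typically threaded into audio loading, assuming torchaudio >= 2.1 (where `torchaudio.load` accepts a per-call `backend=` keyword); the reference path is hypothetical.

```python
import torch
import torchaudio

# Mirror the patch: prefer ffmpeg, otherwise fall back to soundfile
# ("sox" is no longer considered).
backends = torchaudio.list_audio_backends()
backend = "ffmpeg" if "ffmpeg" in backends else "soundfile"


def load_reference(path: str) -> tuple[torch.Tensor, int]:
    # torchaudio >= 2.1 lets the caller pin the I/O backend per call;
    # on older releases this keyword is unavailable, so treat it as version-dependent.
    waveform, sample_rate = torchaudio.load(path, backend=backend)
    return waveform, sample_rate


if __name__ == "__main__":
    wav, sr = load_reference("references/example/sample.wav")  # hypothetical path
    print(backend, wav.shape, sr)
```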