improve tts config

lipku 2024-04-21 18:19:24 +08:00
parent 2e64be4b5d
commit 995dff00df
4 changed files with 81 additions and 24 deletions


@@ -8,6 +8,7 @@ A streaming digital human based on the Ernerf model realize audio video synch
2. Supports large language model dialogue
3. Supports multiple audio feature drivers: wav2vec, hubert
4. Supports full-body video stitching
5. Supports rtmp and webrtc
## 1. Installation
@@ -58,14 +59,22 @@ export HF_ENDPOINT=https://hf-mirror.com
Open http://serverip:8010/chat.html in a browser
-### 3.2 Using a local tts service (supports voice cloning)
+### 3.2 Voice cloning
Either of the two services below can be used; gpt-sovits is recommended
#### 3.2.1 gpt-sovits
Deploy the service as described in [gpt-sovits](/tts/README.md)
Run
```
python app.py --tts gpt-sovits --TTS_SERVER http://127.0.0.1:5000 --CHARACTER test --EMOTION default
```
#### 3.2.2 xtts
To run the xtts service, see https://github.com/coqui-ai/xtts-streaming-server
```
docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 9000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest
```
Then run (ref.wav is the voice file to clone)
```
-python app.py --tts xtts --ref_file data/ref.wav
+python app.py --tts xtts --REF_FILE data/ref.wav --TTS_SERVER http://localhost:9000
```
### 3.3 Using hubert for audio features

app.py

@@ -85,13 +85,13 @@ def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
    print("xtts response.elapsed:", res.elapsed)
-def gpt_sovits(text, character, language, server_url, stream_chunk_size) -> Iterator[bytes]:
+def gpt_sovits(text, character, language, server_url, emotion) -> Iterator[bytes]:
    start = time.perf_counter()
    req={}
    req["text"] = text
    req["text_language"] = language
    req["character"] = character
-    #req["emotion"] = emotion
+    req["emotion"] = emotion
    #req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
    req["stream"] = True
    res = requests.post(
@@ -133,10 +133,10 @@ def txt_to_audio(text_):
        stream_tts(
            gpt_sovits(
                text_,
-                "test", #character
+                app.config['CHARACTER'], #"test", #character
                "zh", #en args.language,
-                "http://127.0.0.1:5000", #args.server_url,
-                "20" #args.stream_chunk_size
+                app.config['TTS_SERVER'], #"http://127.0.0.1:5000", #args.server_url,
+                app.config['EMOTION'], #emotion
            ),
            nerfreal
        )
@@ -146,7 +146,7 @@ def txt_to_audio(text_):
                text_,
                gspeaker,
                "zh-cn", #en args.language,
-                "http://localhost:9000", #args.server_url,
+                app.config['TTS_SERVER'], #"http://localhost:9000", #args.server_url,
                "20" #args.stream_chunk_size
            ),
            nerfreal
@@ -363,17 +363,19 @@ if __name__ == '__main__':
    parser.add_argument('--fullbody_offset_y', type=int, default=0)
    parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
-    parser.add_argument('--ref_file', type=str, default=None)
-    parser.add_argument('--tts_server', type=str, default='http://localhost:9000')
+    parser.add_argument('--REF_FILE', type=str, default=None)
+    parser.add_argument('--TTS_SERVER', type=str, default='http://localhost:9000') #http://127.0.0.1:5000
+    parser.add_argument('--CHARACTER', type=str, default='test')
+    parser.add_argument('--EMOTION', type=str, default='default')
    opt = parser.parse_args()
    app.config.from_object(opt)
-    #print(app.config['tts_server'])
+    print(app.config)
    tts_type = opt.tts
    if tts_type == "xtts":
        print("Computing the latents for a new reference...")
-        gspeaker = get_speaker(opt.ref_file, opt.tts_server)
+        gspeaker = get_speaker(opt.REF_FILE, opt.TTS_SERVER)  # opt.tts_server no longer exists once the flag is --TTS_SERVER
    # assert test mode
    opt.test = True
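The uppercase renames above are what make `app.config.from_object(opt)` work: Flask's `from_object` copies only uppercase attributes from the given object, so a lowercase dest like the old `--tts_server` never reached `app.config`. A minimal stdlib sketch of this pattern (the `Config` class here is a stand-in for Flask's config, not Flask itself):

```python
import argparse

class Config(dict):
    def from_object(self, obj):
        # Mimics Flask's app.config.from_object(): copy only
        # UPPERCASE attributes of the object into the config.
        for key in dir(obj):
            if key.isupper():
                self[key] = getattr(obj, key)

parser = argparse.ArgumentParser()
parser.add_argument('--tts', type=str, default='edgetts')
parser.add_argument('--TTS_SERVER', type=str, default='http://localhost:9000')
parser.add_argument('--CHARACTER', type=str, default='test')
parser.add_argument('--EMOTION', type=str, default='default')

opt = parser.parse_args([])
config = Config()
config.from_object(opt)
# TTS_SERVER, CHARACTER, EMOTION are copied; lowercase 'tts' is not
```

This is why the commit keeps reading the TTS backend choice from `opt.tts` directly rather than from `app.config`.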


@@ -2,7 +2,7 @@
## Deploy tts inference
git clone https://github.com/X-T-E-R/GPT-SoVITS-Inference.git
-1. Install dependencies
+## 1. Install dependencies
```
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
@@ -10,7 +10,7 @@ bash install.sh
```
Download the pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS\pretrained_models`
-2. Model Folder Format
+## 2. Model Folder Format
Model files can be downloaded from https://www.yuque.com/xter/zibxlp/gsximn7ditzgispg
Put the downloaded model files under the trained directory, e.g. `trained/Character1/`
Put the pth / ckpt / wav files in it; the wav should be named as the prompt text
@@ -24,14 +24,19 @@ trained
----hutao said something.wav
```
-3. Start
-Backend API: python Inference/src/tts_backend.py
-If you get an error that cmudict cannot be found, download https://github.com/nltk/nltk_data, rename packages to nltk_data, and put it in your home directory
-Admin page: python Inference/src/TTS_Webui.py; open it in a browser to manage characters and emotions
+## 3. Start
+### 3.1 Backend service:
+python Inference/src/tts_backend.py
+If you get an error that cmudict cannot be found, download https://github.com/nltk/nltk_data, rename packages to nltk_data, and put it in your home directory
+### 3.2 Manage characters:
+python Inference/src/Character_Manager.py
+Open it in a browser to manage characters and emotions
+### 3.3 Test the tts function:
+python Inference/src/TTS_Webui.py
-4. API test
-Character and Emotion List
+## 4. API reference
+### 4.1 Character and Emotion List
To obtain the supported characters and their corresponding emotions, please visit the following URL:
- URL: `http://127.0.0.1:5000/character_list`
- Returns: A JSON format list of characters and corresponding emotions
@@ -48,4 +53,47 @@ To obtain the supported characters and their corresponding emotions, please visit
"default"
]
}
```
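A client can consume this endpoint with a plain GET and walk the returned mapping of characters to emotions. A minimal sketch (the network call is commented out; `raw` is a hypothetical sample of the documented response shape):

```python
import json
# from urllib.request import urlopen
# raw = urlopen('http://127.0.0.1:5000/character_list').read()
raw = '{"Hutao": ["default", "angry"], "test": ["default"]}'  # sample response shape

characters = json.loads(raw)
for name, emotions in characters.items():
    # Each key is a character folder name; the value lists its emotions
    print(name, '->', ', '.join(emotions))
```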
### 4.2 Text-to-Speech
- URL: `http://127.0.0.1:5000/tts`
- Returns: Audio on success. Error message on failure.
- Method: `GET`/`POST`
```
{
"method": "POST",
"body": {
"character": "${chaName}",
"emotion": "${Emotion}",
"text": "${speakText}",
"text_language": "${textLanguage}",
"batch_size": ${batch_size},
"speed": ${speed},
"top_k": ${topK},
"top_p": ${topP},
"temperature": ${temperature},
"stream": "${stream}",
"format": "${Format}",
"save_temp": "${saveTemp}"
}
}
```
#### Parameter Explanation
- **text**: The text to be converted, URL encoding is recommended.
- **character**: Character folder name, pay attention to case sensitivity, full/half width, and language.
- **emotion**: Character emotion, must be an actually supported emotion of the character, otherwise, the default emotion will be used.
- **text_language**: Text language (auto / zh / en / ja), default is multilingual mixed.
- **top_k**, **top_p**, **temperature**: GPT model parameters, no need to modify if unfamiliar.
- **batch_size**: How many batches at a time, can be increased for faster processing if you have a powerful computer, integer, default is 1.
- **speed**: Speech speed, default is 1.0.
- **save_temp**: Whether to save temporary files, when true, the backend will save the generated audio, and subsequent identical requests will directly return that data, default is false.
- **stream**: Whether to stream, when true, audio will be returned sentence by sentence, default is false.
- **format**: Output format, default is WAV; allowed formats: MP3 / WAV / OGG.
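Putting the parameters above together, a `/tts` call can be assembled from the stdlib alone. A sketch (field values are illustrative; only the field names documented above are assumed):

```python
import json
import urllib.request

def build_tts_request(text, character='test', emotion='default',
                      server='http://127.0.0.1:5000'):
    # Assemble the POST body from the documented /tts parameters
    body = {
        'text': text,
        'text_language': 'zh',
        'character': character,
        'emotion': emotion,
        'stream': True,
        'format': 'wav',
    }
    return urllib.request.Request(
        server + '/tts',
        data=json.dumps(body).encode('utf-8'),
        headers={'Content-Type': 'application/json'},
        method='POST',
    )

req = build_tts_request('你好')
# urllib.request.urlopen(req) would then return the audio stream
```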
## Deploy tts training
https://github.com/RVC-Boss/GPT-SoVITS
Deploy it according to its documentation, then copy the trained model into the inference service's trained directory


@@ -2,9 +2,7 @@ import requests
import pyaudio
# Streaming audio URL; feel free to change this to a POST request
-# stream_url = 'http://127.0.0.1:9880/tts?text=这是一段测试文本,旨在通过多种语言风格和复杂性的内容来全面检验文本到语音系统的性能。接下来,我们会探索各种主题和语言结构,包括文学引用、技术性描述、日常会话以及诗歌等。首先,让我们从一段简单的描述性文本开始:“在一个阳光明媚的下午,一位年轻的旅者站在山顶上,眺望着下方那宽广而繁忙的城市。他的心中充满了对未来的憧憬和对旅途的期待。”这段文本测试了系统对自然景观描写的处理能力和情感表达的细腻程度。&stream=true'
-stream_url = 'http://127.0.0.1:9880/tts?text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。&text_lang=zh&ref_audio_path=mengpai.wav&prompt_lang=zh&prompt_text=呜哇好生气啊!不要把我跟一斗相提并论!&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true'
+stream_url = 'http://127.0.0.1:5000/tts?text=这是一段测试文本,旨在通过多种语言风格和复杂性的内容来全面检验文本到语音系统的性能。接下来,我们会探索各种主题和语言结构,包括文学引用、技术性描述、日常会话以及诗歌等。首先,让我们从一段简单的描述性文本开始:“在一个阳光明媚的下午,一位年轻的旅者站在山顶上,眺望着下方那宽广而繁忙的城市。他的心中充满了对未来的憧憬和对旅途的期待。”这段文本测试了系统对自然景观描写的处理能力和情感表达的细腻程度。&stream=true'
# Initialize pyaudio
p = pyaudio.PyAudio()
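The rest of the script (not shown in this hunk) opens a pyaudio stream and writes the response chunk by chunk. The chunking logic can be sketched independently of pyaudio; here `fake_resp` stands in for the streaming HTTP response, and each yielded chunk is what would be passed to the audio stream's `write()`:

```python
import io

CHUNK = 4096  # bytes per write to the audio device

def stream_chunks(resp, chunk_size=CHUNK):
    # Yield fixed-size chunks from a streaming response until EOF
    while True:
        chunk = resp.read(chunk_size)
        if not chunk:
            break
        yield chunk

fake_resp = io.BytesIO(b'\x00' * 10000)  # stand-in for the HTTP response body
chunks = list(stream_chunks(fake_resp))  # 4096 + 4096 + 1808 bytes
```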