improve tts config
This commit is contained in:
parent
2e64be4b5d
commit
995dff00df
13
README.md
13
README.md
|
@ -8,6 +8,7 @@ A streaming digital human based on the Ernerf model, realize audio video synch
|
||||||
2. 支持大模型对话
|
2. 支持大模型对话
|
||||||
3. 支持多种音频特征驱动:wav2vec、hubert
|
3. 支持多种音频特征驱动:wav2vec、hubert
|
||||||
4. 支持全身视频拼接
|
4. 支持全身视频拼接
|
||||||
|
5. 支持rtmp和webrtc
|
||||||
|
|
||||||
## 1. Installation
|
## 1. Installation
|
||||||
|
|
||||||
|
@ -58,14 +59,22 @@ export HF_ENDPOINT=https://hf-mirror.com
|
||||||
|
|
||||||
用浏览器打开http://serverip:8010/chat.html
|
用浏览器打开http://serverip:8010/chat.html
|
||||||
|
|
||||||
### 3.2 使用本地tts服务,支持声音克隆
|
### 3.2 声音克隆
|
||||||
|
可以任意选用下面两种服务,推荐用gpt-sovits
|
||||||
|
#### 3.2.1 gpt-sovits
|
||||||
|
服务部署参照[gpt-sovits](/tts/README.md)
|
||||||
|
运行
|
||||||
|
```
|
||||||
|
python app.py --tts gpt-sovits --TTS_SERVER http://127.0.0.1:5000 --CHARACTER test --EMOTION default
|
||||||
|
```
|
||||||
|
#### 3.2.2 xtts
|
||||||
运行xtts服务,参照 https://github.com/coqui-ai/xtts-streaming-server
|
运行xtts服务,参照 https://github.com/coqui-ai/xtts-streaming-server
|
||||||
```
|
```
|
||||||
docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 9000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest
|
docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 9000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest
|
||||||
```
|
```
|
||||||
然后运行,其中ref.wav为需要克隆的声音文件
|
然后运行,其中ref.wav为需要克隆的声音文件
|
||||||
```
|
```
|
||||||
python app.py --tts xtts --ref_file data/ref.wav
|
python app.py --tts xtts --REF_FILE data/ref.wav --TTS_SERVER http://localhost:9000
|
||||||
```
|
```
|
||||||
|
|
||||||
### 3.3 音频特征用hubert
|
### 3.3 音频特征用hubert
|
||||||
|
|
22
app.py
22
app.py
|
@ -85,13 +85,13 @@ def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[byt
|
||||||
|
|
||||||
print("xtts response.elapsed:", res.elapsed)
|
print("xtts response.elapsed:", res.elapsed)
|
||||||
|
|
||||||
def gpt_sovits(text, character, language, server_url, stream_chunk_size) -> Iterator[bytes]:
|
def gpt_sovits(text, character, language, server_url, emotion) -> Iterator[bytes]:
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
req={}
|
req={}
|
||||||
req["text"] = text
|
req["text"] = text
|
||||||
req["text_language"] = language
|
req["text_language"] = language
|
||||||
req["character"] = character
|
req["character"] = character
|
||||||
#req["emotion"] = emotion
|
req["emotion"] = emotion
|
||||||
#req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
|
#req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
|
||||||
req["stream"] = True
|
req["stream"] = True
|
||||||
res = requests.post(
|
res = requests.post(
|
||||||
|
@ -133,10 +133,10 @@ def txt_to_audio(text_):
|
||||||
stream_tts(
|
stream_tts(
|
||||||
gpt_sovits(
|
gpt_sovits(
|
||||||
text_,
|
text_,
|
||||||
"test", #character
|
app.config['CHARACTER'], #"test", #character
|
||||||
"zh", #en args.language,
|
"zh", #en args.language,
|
||||||
"http://127.0.0.1:5000", #args.server_url,
|
app.config['TTS_SERVER'], #"http://127.0.0.1:5000", #args.server_url,
|
||||||
"20" #args.stream_chunk_size
|
app.config['EMOTION'], #emotion
|
||||||
),
|
),
|
||||||
nerfreal
|
nerfreal
|
||||||
)
|
)
|
||||||
|
@ -146,7 +146,7 @@ def txt_to_audio(text_):
|
||||||
text_,
|
text_,
|
||||||
gspeaker,
|
gspeaker,
|
||||||
"zh-cn", #en args.language,
|
"zh-cn", #en args.language,
|
||||||
"http://localhost:9000", #args.server_url,
|
app.config['TTS_SERVER'], #"http://localhost:9000", #args.server_url,
|
||||||
"20" #args.stream_chunk_size
|
"20" #args.stream_chunk_size
|
||||||
),
|
),
|
||||||
nerfreal
|
nerfreal
|
||||||
|
@ -363,17 +363,19 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('--fullbody_offset_y', type=int, default=0)
|
parser.add_argument('--fullbody_offset_y', type=int, default=0)
|
||||||
|
|
||||||
parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
|
parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
|
||||||
parser.add_argument('--ref_file', type=str, default=None)
|
parser.add_argument('--REF_FILE', type=str, default=None)
|
||||||
parser.add_argument('--tts_server', type=str, default='http://localhost:9000')
|
parser.add_argument('--TTS_SERVER', type=str, default='http://localhost:9000') #http://127.0.0.1:5000
|
||||||
|
parser.add_argument('--CHARACTER', type=str, default='test')
|
||||||
|
parser.add_argument('--EMOTION', type=str, default='default')
|
||||||
|
|
||||||
opt = parser.parse_args()
|
opt = parser.parse_args()
|
||||||
app.config.from_object(opt)
|
app.config.from_object(opt)
|
||||||
#print(app.config['tts_server'])
|
print(app.config)
|
||||||
|
|
||||||
tts_type = opt.tts
|
tts_type = opt.tts
|
||||||
if tts_type == "xtts":
|
if tts_type == "xtts":
|
||||||
print("Computing the latents for a new reference...")
|
print("Computing the latents for a new reference...")
|
||||||
gspeaker = get_speaker(opt.ref_file, opt.tts_server)
|
# The CLI flag is now --TTS_SERVER, and argparse uses the flag name verbatim
# as the attribute name, so the value lives at opt.TTS_SERVER; the former
# lowercase opt.tts_server attribute no longer exists and would raise
# AttributeError when --tts xtts is selected.
gspeaker = get_speaker(opt.REF_FILE, opt.TTS_SERVER)
|
||||||
|
|
||||||
# assert test mode
|
# assert test mode
|
||||||
opt.test = True
|
opt.test = True
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
## 部署tts推理
|
## 部署tts推理
|
||||||
git clone https://github.com/X-T-E-R/GPT-SoVITS-Inference.git
|
git clone https://github.com/X-T-E-R/GPT-SoVITS-Inference.git
|
||||||
|
|
||||||
1. 安装依赖库
|
## 1. 安装依赖库
|
||||||
```
|
```
|
||||||
conda create -n GPTSoVits python=3.9
|
conda create -n GPTSoVits python=3.9
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVits
|
||||||
|
@ -10,7 +10,7 @@ bash install.sh
|
||||||
```
|
```
|
||||||
从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将它们放置在 `GPT_SoVITS\pretrained_models` 中
|
从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将它们放置在 `GPT_SoVITS\pretrained_models` 中
|
||||||
|
|
||||||
2. Model Folder Format
|
## 2. Model Folder Format
|
||||||
模型文件下载地址 https://www.yuque.com/xter/zibxlp/gsximn7ditzgispg
|
模型文件下载地址 https://www.yuque.com/xter/zibxlp/gsximn7ditzgispg
|
||||||
下载的模型文件放到trained目录下, 如 `trained/Character1/`
|
下载的模型文件放到trained目录下, 如 `trained/Character1/`
|
||||||
Put the pth / ckpt / wav files in it, the wav should be named as the prompt text
|
Put the pth / ckpt / wav files in it, the wav should be named as the prompt text
|
||||||
|
@ -24,14 +24,19 @@ trained
|
||||||
----hutao said something.wav
|
----hutao said something.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
3. 启动
|
## 3. 启动
|
||||||
后端接口: python Inference/src/tts_backend.py
|
### 3.1 后端服务:
|
||||||
如果有错误提示找不到cmudict,从这下载https://github.com/nltk/nltk_data,将packages改名为nltk_data放到home目录下
|
python Inference/src/tts_backend.py
|
||||||
管理页面: python Inference/src/TTS_Webui.py, 浏览器打开可以管理character和emotion
|
如果有错误提示找不到cmudict,从这里下载 https://github.com/nltk/nltk_data ,将packages改名为nltk_data放到home目录下
|
||||||
|
### 3.2 管理character:
|
||||||
|
python Inference/src/Character_Manager.py
|
||||||
|
浏览器打开可以管理character和emotion
|
||||||
|
### 3.3 测试tts功能:
|
||||||
|
python Inference/src/TTS_Webui.py
|
||||||
|
|
||||||
|
|
||||||
4. 接口测试
|
## 4. 接口说明
|
||||||
Character and Emotion List
|
### 4.1 Character and Emotion List
|
||||||
To obtain the supported characters and their corresponding emotions, please visit the following URL:
|
To obtain the supported characters and their corresponding emotions, please visit the following URL:
|
||||||
- URL: `http://127.0.0.1:5000/character_list`
|
- URL: `http://127.0.0.1:5000/character_list`
|
||||||
- Returns: A JSON format list of characters and corresponding emotions
|
- Returns: A JSON format list of characters and corresponding emotions
|
||||||
|
@ -48,4 +53,47 @@ To obtain the supported characters and their corresponding emotions, please visi
|
||||||
"default"
|
"default"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### 4.2 Text-to-Speech
|
||||||
|
|
||||||
|
- URL: `http://127.0.0.1:5000/tts`
|
||||||
|
- Returns: Audio on success. Error message on failure.
|
||||||
|
- Method: `GET`/`POST`
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"method": "POST",
|
||||||
|
"body": {
|
||||||
|
"character": "${chaName}",
|
||||||
|
"emotion": "${Emotion}",
|
||||||
|
"text": "${speakText}",
|
||||||
|
"text_language": "${textLanguage}",
|
||||||
|
"batch_size": ${batch_size},
|
||||||
|
"speed": ${speed},
|
||||||
|
"top_k": ${topK},
|
||||||
|
"top_p": ${topP},
|
||||||
|
"temperature": ${temperature},
|
||||||
|
"stream": "${stream}",
|
||||||
|
"format": "${Format}",
|
||||||
|
"save_temp": "${saveTemp}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Parameter Explanation
|
||||||
|
|
||||||
|
- **text**: The text to be converted, URL encoding is recommended.
|
||||||
|
- **character**: Character folder name. Pay attention to case, full-width/half-width characters, and language.
|
||||||
|
- **emotion**: Character emotion. It must be an emotion the character actually supports; otherwise, the default emotion is used.
|
||||||
|
- **text_language**: Text language (auto / zh / en / ja), default is multilingual mixed.
|
||||||
|
- **top_k**, **top_p**, **temperature**: GPT model parameters, no need to modify if unfamiliar.
|
||||||
|
|
||||||
|
- **batch_size**: Number of batches processed at a time (integer, default is 1). It can be increased for faster processing if you have a powerful computer.
|
||||||
|
- **speed**: Speech speed, default is 1.0.
|
||||||
|
- **save_temp**: Whether to save temporary files. When true, the backend saves the generated audio, and subsequent identical requests return that saved data directly. Default is false.
|
||||||
|
- **stream**: Whether to stream. When true, audio is returned sentence by sentence. Default is false.
|
||||||
|
- **format**: Output audio format. Default is WAV; allowed values are MP3, WAV, and OGG.
|
||||||
|
|
||||||
|
## 部署tts训练
|
||||||
|
https://github.com/RVC-Boss/GPT-SoVITS
|
||||||
|
根据文档说明部署,将训练后的模型拷到推理服务的trained目录下
|
|
@ -2,9 +2,7 @@ import requests
|
||||||
import pyaudio
|
import pyaudio
|
||||||
|
|
||||||
# 流式传输音频的URL,你可以自由改成Post
|
# 流式传输音频的URL,你可以自由改成Post
|
||||||
# stream_url = 'http://127.0.0.1:9880/tts?text=这是一段测试文本,旨在通过多种语言风格和复杂性的内容来全面检验文本到语音系统的性能。接下来,我们会探索各种主题和语言结构,包括文学引用、技术性描述、日常会话以及诗歌等。首先,让我们从一段简单的描述性文本开始:“在一个阳光明媚的下午,一位年轻的旅者站在山顶上,眺望着下方那宽广而繁忙的城市。他的心中充满了对未来的憧憬和对旅途的期待。”这段文本测试了系统对自然景观描写的处理能力和情感表达的细腻程度。&stream=true'
|
stream_url = 'http://127.0.0.1:5000/tts?text=这是一段测试文本,旨在通过多种语言风格和复杂性的内容来全面检验文本到语音系统的性能。接下来,我们会探索各种主题和语言结构,包括文学引用、技术性描述、日常会话以及诗歌等。首先,让我们从一段简单的描述性文本开始:“在一个阳光明媚的下午,一位年轻的旅者站在山顶上,眺望着下方那宽广而繁忙的城市。他的心中充满了对未来的憧憬和对旅途的期待。”这段文本测试了系统对自然景观描写的处理能力和情感表达的细腻程度。&stream=true'
|
||||||
|
|
||||||
stream_url = 'http://127.0.0.1:9880/tts?text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。&text_lang=zh&ref_audio_path=mengpai.wav&prompt_lang=zh&prompt_text=呜哇好生气啊!不要把我跟一斗相提并论!&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true'
|
|
||||||
|
|
||||||
# 初始化pyaudio
|
# 初始化pyaudio
|
||||||
p = pyaudio.PyAudio()
|
p = pyaudio.PyAudio()
|
||||||
|
|
Loading…
Reference in New Issue