fix gpt-sovits
This commit is contained in:
parent
d01860176e
commit
58e763fdb6
|
@ -70,8 +70,10 @@ export HF_ENDPOINT=https://hf-mirror.com
|
||||||
服务部署参照[gpt-sovits](/tts/README.md)
|
服务部署参照[gpt-sovits](/tts/README.md)
|
||||||
运行
|
运行
|
||||||
```
|
```
|
||||||
python app.py --tts gpt-sovits --TTS_SERVER http://127.0.0.1:5000 --CHARACTER test --EMOTION default
|
python app.py --tts gpt-sovits --TTS_SERVER http://127.0.0.1:9880 --REF_FILE data/ref.wav --REF_TEXT xxx
|
||||||
```
|
```
|
||||||
|
REF_TEXT为REF_FILE中语音内容,时长不宜过长
|
||||||
|
|
||||||
#### 3.2.2 xtts
|
#### 3.2.2 xtts
|
||||||
运行xtts服务,参照 https://github.com/coqui-ai/xtts-streaming-server
|
运行xtts服务,参照 https://github.com/coqui-ai/xtts-streaming-server
|
||||||
```
|
```
|
||||||
|
|
7
app.py
7
app.py
|
@ -290,9 +290,10 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
|
parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
|
||||||
parser.add_argument('--REF_FILE', type=str, default=None)
|
parser.add_argument('--REF_FILE', type=str, default=None)
|
||||||
parser.add_argument('--TTS_SERVER', type=str, default='http://localhost:9000') #http://127.0.0.1:5000
|
parser.add_argument('--REF_TEXT', type=str, default=None)
|
||||||
parser.add_argument('--CHARACTER', type=str, default='test')
|
parser.add_argument('--TTS_SERVER', type=str, default='http://127.0.0.1:9880') # http://localhost:9000
|
||||||
parser.add_argument('--EMOTION', type=str, default='default')
|
# parser.add_argument('--CHARACTER', type=str, default='test')
|
||||||
|
# parser.add_argument('--EMOTION', type=str, default='default')
|
||||||
|
|
||||||
parser.add_argument('--model', type=str, default='ernerf') #musetalk
|
parser.add_argument('--model', type=str, default='ernerf') #musetalk
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,139 @@
|
||||||
|
# 采用gpt-sovits方案,bert-sovits适合长音频训练,gpt-sovits运行短音频快速推理
|
||||||
|
## 部署tts推理
|
||||||
|
git clone https://github.com/X-T-E-R/GPT-SoVITS-Inference.git
|
||||||
|
|
||||||
|
## 1. 安装依赖库
|
||||||
|
```
|
||||||
|
conda create -n GPTSoVits python=3.9
|
||||||
|
conda activate GPTSoVits
|
||||||
|
bash install.sh
|
||||||
|
```
|
||||||
|
从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将它们放置在 `GPT_SoVITS\pretrained_models` 中
|
||||||
|
|
||||||
|
注意
|
||||||
|
```
|
||||||
|
是将 GPT-SoVITS 的模型文件放入 pretrained_models目录中
|
||||||
|
```
|
||||||
|
如下
|
||||||
|
```
|
||||||
|
pretrained_models/
|
||||||
|
--chinese-hubert-base
|
||||||
|
--chinese-roberta-wwm-ext-large
|
||||||
|
s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
|
||||||
|
s2D488k.pth
|
||||||
|
s2G488k.pth
|
||||||
|
```
|
||||||
|
|
||||||
|
## 2. Model Folder Format
|
||||||
|
模型文件下载地址 https://www.yuque.com/xter/zibxlp/gsximn7ditzgispg
|
||||||
|
下载的模型文件放到trained目录下, 如 `trained/Character1/`
|
||||||
|
Put the pth / ckpt / wav files in it; the wav file should be named after its prompt text.
|
||||||
|
Like :
|
||||||
|
|
||||||
|
```
|
||||||
|
trained
|
||||||
|
--hutao
|
||||||
|
----hutao-e75.ckpt
|
||||||
|
----hutao_e60_s3360.pth
|
||||||
|
----hutao said something.wav
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3. 启动
|
||||||
|
### 3.1 启动webui界面
|
||||||
|
python webuis/character_manager/webui.py
|
||||||
|
可以设置上传的模型数据
|
||||||
|
### 3.2 启动api服务:
|
||||||
|
python app.py
|
||||||
|
|
||||||
|
如果有错误提示找不到cmudict,请从 https://github.com/nltk/nltk_data 下载,将packages改名为nltk_data放到home目录下
|
||||||
|
### 3.3 tts测试
|
||||||
|
访问 http://127.0.0.1:5000 地址即可测试
|
||||||
|
|
||||||
|
### 3.4 api测试
|
||||||
|
访问 http://127.0.0.1:5000/character_list 查看是否正常
|
||||||
|
|
||||||
|
## 4. 接口说明
|
||||||
|
### 4.1 Character and Emotion List
|
||||||
|
To obtain the supported characters and their corresponding emotions, please visit the following URL:
|
||||||
|
- URL: `http://127.0.0.1:5000/character_list`
|
||||||
|
- Returns: A JSON format list of characters and corresponding emotions
|
||||||
|
- Method: `GET`
|
||||||
|
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"Hanabi": [
|
||||||
|
"default",
|
||||||
|
"Normal",
|
||||||
|
"Yandere"
|
||||||
|
],
|
||||||
|
"Hutao": [
|
||||||
|
"default"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.2 Text-to-Speech
|
||||||
|
|
||||||
|
- URL: `http://127.0.0.1:5000/tts`
|
||||||
|
- Returns: Audio on success. Error message on failure.
|
||||||
|
- Method: `GET`/`POST`
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"method": "POST",
|
||||||
|
"body": {
|
||||||
|
"character": "${chaName}",
|
||||||
|
"emotion": "${Emotion}",
|
||||||
|
"text": "${speakText}",
|
||||||
|
"text_language": "${textLanguage}",
|
||||||
|
"batch_size": ${batch_size},
|
||||||
|
"speed": ${speed},
|
||||||
|
"top_k": ${topK},
|
||||||
|
"top_p": ${topP},
|
||||||
|
"temperature": ${temperature},
|
||||||
|
"stream": "${stream}",
|
||||||
|
"format": "${Format}",
|
||||||
|
"save_temp": "${saveTemp}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Parameter Explanation
|
||||||
|
|
||||||
|
- **text**: The text to be converted, URL encoding is recommended.
|
||||||
|
- **character**: Character folder name, pay attention to case sensitivity, full/half width, and language.
|
||||||
|
- **emotion**: Character emotion, must be an actually supported emotion of the character, otherwise, the default emotion will be used.
|
||||||
|
- **text_language**: Text language (auto / zh / en / ja), default is multilingual mixed.
|
||||||
|
- **top_k**, **top_p**, **temperature**: GPT model parameters, no need to modify if unfamiliar.
|
||||||
|
|
||||||
|
- **batch_size**: How many batches at a time, can be increased for faster processing if you have a powerful computer, integer, default is 1.
|
||||||
|
- **speed**: Speech speed, default is 1.0.
|
||||||
|
- **save_temp**: Whether to save temporary files, when true, the backend will save the generated audio, and subsequent identical requests will directly return that data, default is false.
|
||||||
|
- **stream**: Whether to stream, when true, audio will be returned sentence by sentence, default is false.
|
||||||
|
- **format**: Format, default is WAV, allows MP3/ WAV/ OGG.
|
||||||
|
|
||||||
|
## 部署tts训练
|
||||||
|
https://github.com/RVC-Boss/GPT-SoVITS
|
||||||
|
根据文档说明部署,将训练后的模型拷到推理服务的trained目录下
|
||||||
|
|
||||||
|
## 如果你需要使用autodl 进行部署
|
||||||
|
请使用 https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS 作为基础镜像你能快速进行部署
|
||||||
|
### 下载
|
||||||
|
```
|
||||||
|
https://github.com/X-T-E-R/GPT-SoVITS-Inference
|
||||||
|
```
|
||||||
|
### 安装
|
||||||
|
```
|
||||||
|
cd GPT-SoVITS-Inference
|
||||||
|
pip3 install -r requirements.txt
|
||||||
|
cp -r GPT_SoVITS/pretrained_models/ ./GPT_SoVITS/pretrained_models
|
||||||
|
```
|
||||||
|
|
||||||
|
### 启动api
|
||||||
|
```
|
||||||
|
python3 app.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### 启动webui
|
||||||
|
```
|
||||||
|
python3 webuis/character_manager/webui.py
|
||||||
|
```
|
164
tts/README.md
164
tts/README.md
|
@ -1,14 +1,14 @@
|
||||||
# 采用gpt-sovits方案,bert-sovits适合长音频训练,gpt-sovits运行短音频快速推理
|
# 采用gpt-sovits方案,bert-sovits适合长音频训练,gpt-sovits运行短音频快速推理
|
||||||
## 部署tts推理
|
## 部署tts推理
|
||||||
git clone https://github.com/X-T-E-R/GPT-SoVITS-Inference.git
|
git clone https://github.com/RVC-Boss/GPT-SoVITS.git
|
||||||
|
git checkout fast_inference_
|
||||||
## 1. 安装依赖库
|
## 1. 安装依赖库
|
||||||
```
|
```
|
||||||
conda create -n GPTSoVits python=3.9
|
conda create -n GPTSoVits python=3.9
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVits
|
||||||
bash install.sh
|
bash install.sh
|
||||||
```
|
```
|
||||||
从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将它们放置在 `GPT_SoVITS\pretrained_models` 中
|
从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将它们放置在 `GPT-SoVITS/GPT_SoVITS/pretrained_models` 中
|
||||||
|
|
||||||
注意
|
注意
|
||||||
```
|
```
|
||||||
|
@ -24,116 +24,80 @@ s2D488k.pth
|
||||||
s2G488k.pth
|
s2G488k.pth
|
||||||
```
|
```
|
||||||
|
|
||||||
## 2. Model Folder Format
|
|
||||||
模型文件下载地址 https://www.yuque.com/xter/zibxlp/gsximn7ditzgispg
|
|
||||||
下载的模型文件放到trained目录下, 如 `trained/Character1/`
|
|
||||||
Put the pth / ckpt / wav files in it, the wav should be named as the prompt text
|
|
||||||
Like :
|
|
||||||
|
|
||||||
```
|
|
||||||
trained
|
|
||||||
--hutao
|
|
||||||
----hutao-e75.ckpt
|
|
||||||
----hutao_e60_s3360.pth
|
|
||||||
----hutao said something.wav
|
|
||||||
```
|
|
||||||
|
|
||||||
## 3. 启动
|
## 3. 启动
|
||||||
### 3.1 启动webui界面
|
### 3.1 启动webui界面(测试效果用)
|
||||||
python webuis/character_manager/webui.py
|
python GPT_SoVITS/inference_webui.py
|
||||||
可以设置上传的模型数据
|
|
||||||
### 3.2 启动api服务:
|
### 3.2 启动api服务:
|
||||||
python app.py
|
python api_v3.py
|
||||||
|
|
||||||
如果有错误提示找不到cmudict,从这下载https://github.com/nltk/nltk_data,将packages改名为nltk_data放到home目录下
|
|
||||||
### 3.3 tts测试
|
|
||||||
访问 http://127.0.0.1:5000 地址即可测试
|
|
||||||
|
|
||||||
### 3.4 api测试
|
|
||||||
访问 http://127.0.0.1:5000/character_list 查看是否正常
|
|
||||||
|
|
||||||
## 4. 接口说明
|
## 4. 接口说明
|
||||||
### 4.1 Character and Emotion List
|
|
||||||
To obtain the supported characters and their corresponding emotions, please visit the following URL:
|
|
||||||
- URL: `http://127.0.0.1:5000/character_list`
|
|
||||||
- Returns: A JSON format list of characters and corresponding emotions
|
|
||||||
- Method: `GET`
|
|
||||||
|
|
||||||
|
### 4.1 Text-to-Speech
|
||||||
|
|
||||||
|
endpoint: `/tts`
|
||||||
|
GET:
|
||||||
```
|
```
|
||||||
|
http://127.0.0.1:9880/tts?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_lang=zh&ref_audio_path=archive_jingyuan_1.wav&prompt_lang=zh&prompt_text=我是「罗浮」云骑将军景元。不必拘谨,「将军」只是一时的身份,你称呼我景元便可&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true
|
||||||
|
```
|
||||||
|
|
||||||
|
POST:
|
||||||
|
```json
|
||||||
{
|
{
|
||||||
"Hanabi": [
|
"text": "", # str.(required) text to be synthesized
|
||||||
"default",
|
"text_lang": "", # str.(required) language of the text to be synthesized
|
||||||
"Normal",
|
"ref_audio_path": "", # str.(required) reference audio path.
|
||||||
"Yandere",
|
"prompt_text": "", # str.(optional) prompt text for the reference audio
|
||||||
],
|
"prompt_lang": "", # str.(required) language of the prompt text for the reference audio
|
||||||
"Hutao": [
|
"top_k": 5, # int.(optional) top k sampling
|
||||||
"default"
|
"top_p": 1, # float.(optional) top p sampling
|
||||||
]
|
"temperature": 1, # float.(optional) temperature for sampling
|
||||||
|
"text_split_method": "cut5", # str.(optional) text split method, see text_segmentation_method.py for details.
|
||||||
|
"batch_size": 1, # int.(optional) batch size for inference
|
||||||
|
"batch_threshold": 0.75, # float.(optional) threshold for batch splitting.
|
||||||
|
"split_bucket": true, # bool.(optional) whether to split the batch into multiple buckets.
|
||||||
|
"speed_factor":1.0, # float.(optional) control the speed of the synthesized audio.
|
||||||
|
"fragment_interval":0.3, # float.(optional) to control the interval of the audio fragment.
|
||||||
|
"seed": -1, # int.(optional) random seed for reproducibility.
|
||||||
|
"media_type": "wav", # str.(optional) media type of the output audio, support "wav", "raw", "ogg", "aac".
|
||||||
|
"streaming_mode": false, # bool.(optional) whether to return a streaming response.
|
||||||
|
"parallel_infer": true, # bool.(optional) whether to use parallel inference.
|
||||||
|
"repetition_penalty": 1.35, # float.(optional) repetition penalty for T2S model.
|
||||||
|
"tts_infer_yaml_path": "GPT_SoVITS/configs/tts_infer.yaml" # str.(optional) tts infer yaml path
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4.2 Text-to-Speech
|
|
||||||
|
|
||||||
- URL: `http://127.0.0.1:5000/tts`
|
|
||||||
- Returns: Audio on success. Error message on failure.
|
|
||||||
- Method: `GET`/`POST`
|
|
||||||
```
|
|
||||||
{
|
|
||||||
"method": "POST",
|
|
||||||
"body": {
|
|
||||||
"character": "${chaName}",
|
|
||||||
"emotion": "${Emotion}",
|
|
||||||
"text": "${speakText}",
|
|
||||||
"text_language": "${textLanguage}",
|
|
||||||
"batch_size": ${batch_size},
|
|
||||||
"speed": ${speed},
|
|
||||||
"top_k": ${topK},
|
|
||||||
"top_p": ${topP},
|
|
||||||
"temperature": ${temperature},
|
|
||||||
"stream": "${stream}",
|
|
||||||
"format": "${Format}",
|
|
||||||
"save_temp": "${saveTemp}"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Parameter Explanation
|
|
||||||
|
|
||||||
- **text**: The text to be converted, URL encoding is recommended.
|
|
||||||
- **character**: Character folder name, pay attention to case sensitivity, full/half width, and language.
|
|
||||||
- **emotion**: Character emotion, must be an actually supported emotion of the character, otherwise, the default emotion will be used.
|
|
||||||
- **text_language**: Text language (auto / zh / en / ja), default is multilingual mixed.
|
|
||||||
- **top_k**, **top_p**, **temperature**: GPT model parameters, no need to modify if unfamiliar.
|
|
||||||
|
|
||||||
- **batch_size**: How many batches at a time, can be increased for faster processing if you have a powerful computer, integer, default is 1.
|
|
||||||
- **speed**: Speech speed, default is 1.0.
|
|
||||||
- **save_temp**: Whether to save temporary files, when true, the backend will save the generated audio, and subsequent identical requests will directly return that data, default is false.
|
|
||||||
- **stream**: Whether to stream, when true, audio will be returned sentence by sentence, default is false.
|
|
||||||
- **format**: Format, default is WAV, allows MP3/ WAV/ OGG.
|
|
||||||
|
|
||||||
## 部署tts训练
|
## 部署tts训练
|
||||||
https://github.com/RVC-Boss/GPT-SoVITS
|
https://github.com/RVC-Boss/GPT-SoVITS
|
||||||
根据文档说明部署,将训练后的模型拷到推理服务的trained目录下
|
切换自己训练的模型
|
||||||
|
### 切换GPT模型
|
||||||
|
|
||||||
|
endpoint: `/set_gpt_weights`
|
||||||
|
|
||||||
|
GET:
|
||||||
|
```
|
||||||
|
http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/xxx.ckpt
|
||||||
|
```
|
||||||
|
RESP:
|
||||||
|
成功: 返回"success", http code 200
|
||||||
|
失败: 返回包含错误信息的 json, http code 400
|
||||||
|
|
||||||
|
|
||||||
|
### 切换Sovits模型
|
||||||
|
|
||||||
|
endpoint: `/set_sovits_weights`
|
||||||
|
|
||||||
|
GET:
|
||||||
|
```
|
||||||
|
http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/xxx.pth
|
||||||
|
```
|
||||||
|
|
||||||
|
RESP:
|
||||||
|
成功: 返回"success", http code 200
|
||||||
|
失败: 返回包含错误信息的 json, http code 400
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
## 如果你需要使用autodl 进行部署
|
## 如果你需要使用autodl 进行部署
|
||||||
请使用 https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS 作为基础镜像,这样你能快速进行部署
|
请使用 https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS 作为基础镜像,这样你能快速进行部署
|
||||||
### 下载
|
|
||||||
```
|
|
||||||
https://github.com/X-T-E-R/GPT-SoVITS-Inference
|
|
||||||
```
|
|
||||||
### 安装
|
|
||||||
```
|
|
||||||
cd GPT-SoVITS-Inference
|
|
||||||
pip3 install -r requirements.txt
|
|
||||||
cp -r GPT_SoVITS/pretrained_models/ ./GPT_SoVITS/pretrained_models
|
|
||||||
```
|
|
||||||
|
|
||||||
### 启动api
|
|
||||||
```
|
|
||||||
python3 app.py
|
|
||||||
```
|
|
||||||
|
|
||||||
### 启动webui
|
|
||||||
```
|
|
||||||
python3 webuis/character_manager/webui.py
|
|
||||||
```
|
|
||||||
|
|
28
ttsreal.py
28
ttsreal.py
|
@ -105,22 +105,30 @@ class VoitsTTS(BaseTTS):
|
||||||
self.stream_tts(
|
self.stream_tts(
|
||||||
self.gpt_sovits(
|
self.gpt_sovits(
|
||||||
msg,
|
msg,
|
||||||
self.opt.CHARACTER, #"test", #character
|
self.opt.REF_FILE,
|
||||||
|
self.opt.REF_TEXT,
|
||||||
"zh", #en args.language,
|
"zh", #en args.language,
|
||||||
self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
|
self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
|
||||||
self.opt.EMOTION, #emotion
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def gpt_sovits(self, text, character, language, server_url, emotion) -> Iterator[bytes]:
|
def gpt_sovits(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
req={}
|
req={
|
||||||
req["text"] = text
|
'text':text,
|
||||||
req["text_language"] = language
|
'text_lang':language,
|
||||||
req["character"] = character
|
'ref_audio_path':reffile,
|
||||||
req["emotion"] = emotion
|
'prompt_text':reftext,
|
||||||
#req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
|
'prompt_lang':language,
|
||||||
req["stream"] = True
|
'media_type':'raw',
|
||||||
|
'streaming_mode':True
|
||||||
|
}
|
||||||
|
# req["text"] = text
|
||||||
|
# req["text_language"] = language
|
||||||
|
# req["character"] = character
|
||||||
|
# req["emotion"] = emotion
|
||||||
|
# #req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
|
||||||
|
# req["streaming_mode"] = True
|
||||||
res = requests.post(
|
res = requests.post(
|
||||||
f"{server_url}/tts",
|
f"{server_url}/tts",
|
||||||
json=req,
|
json=req,
|
||||||
|
|
Loading…
Reference in New Issue