add support gpt-sovits
This commit is contained in:
parent
6d4952c1bf
commit
2e64be4b5d
44
app.py
44
app.py
|
@ -75,7 +75,7 @@ def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[byt
|
||||||
return
|
return
|
||||||
|
|
||||||
first = True
|
first = True
|
||||||
for chunk in res.iter_content(chunk_size=960):
|
for chunk in res.iter_content(chunk_size=960): #24K*20ms*2
|
||||||
if first:
|
if first:
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
print(f"xtts Time to first chunk: {end-start}s")
|
print(f"xtts Time to first chunk: {end-start}s")
|
||||||
|
@ -85,12 +85,20 @@ def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[byt
|
||||||
|
|
||||||
print("xtts response.elapsed:", res.elapsed)
|
print("xtts response.elapsed:", res.elapsed)
|
||||||
|
|
||||||
def gpt_sovits(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
|
def gpt_sovits(text, character, language, server_url, stream_chunk_size) -> Iterator[bytes]:
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
speaker["text"] = text
|
req={}
|
||||||
speaker["language"] = language
|
req["text"] = text
|
||||||
speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
|
req["text_language"] = language
|
||||||
res = requests.get(f"{server_url}&text="+text,stream=True)
|
req["character"] = character
|
||||||
|
#req["emotion"] = emotion
|
||||||
|
#req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
|
||||||
|
req["stream"] = True
|
||||||
|
res = requests.post(
|
||||||
|
f"{server_url}/tts",
|
||||||
|
json=req,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
print(f"gpt_sovits Time to make POST: {end-start}s")
|
print(f"gpt_sovits Time to make POST: {end-start}s")
|
||||||
|
|
||||||
|
@ -99,7 +107,7 @@ def gpt_sovits(text, speaker, language, server_url, stream_chunk_size) -> Iterat
|
||||||
return
|
return
|
||||||
|
|
||||||
first = True
|
first = True
|
||||||
for chunk in res.iter_content(chunk_size=960):
|
for chunk in res.iter_content(chunk_size=1280): #32K*20ms*2
|
||||||
if first:
|
if first:
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
print(f"gpt_sovits Time to first chunk: {end-start}s")
|
print(f"gpt_sovits Time to first chunk: {end-start}s")
|
||||||
|
@ -109,7 +117,7 @@ def gpt_sovits(text, speaker, language, server_url, stream_chunk_size) -> Iterat
|
||||||
|
|
||||||
print("gpt_sovits response.elapsed:", res.elapsed)
|
print("gpt_sovits response.elapsed:", res.elapsed)
|
||||||
|
|
||||||
def stream_xtts(audio_stream,render):
|
def stream_tts(audio_stream,render):
|
||||||
for chunk in audio_stream:
|
for chunk in audio_stream:
|
||||||
if chunk is not None:
|
if chunk is not None:
|
||||||
render.push_audio(chunk)
|
render.push_audio(chunk)
|
||||||
|
@ -121,19 +129,19 @@ def txt_to_audio(text_):
|
||||||
t = time.time()
|
t = time.time()
|
||||||
asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
|
asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
|
||||||
print(f'-------edge tts time:{time.time()-t:.4f}s')
|
print(f'-------edge tts time:{time.time()-t:.4f}s')
|
||||||
elif tts_type == "gpt": #gpt_sovits
|
elif tts_type == "gpt-sovits": #gpt_sovits
|
||||||
stream_xtts(
|
stream_tts(
|
||||||
gpt_sovits(
|
gpt_sovits(
|
||||||
text_,
|
text_,
|
||||||
gspeaker,
|
"test", #character
|
||||||
"zh-cn", #en args.language,
|
"zh", #en args.language,
|
||||||
"http://127.0.0.1:9880/tts_ava?ava=maimai&streaming_mode=true", #args.server_url,
|
"http://127.0.0.1:5000", #args.server_url,
|
||||||
"20" #args.stream_chunk_size
|
"20" #args.stream_chunk_size
|
||||||
),
|
),
|
||||||
nerfreal
|
nerfreal
|
||||||
)
|
)
|
||||||
else: #xtts
|
else: #xtts
|
||||||
stream_xtts(
|
stream_tts(
|
||||||
xtts(
|
xtts(
|
||||||
text_,
|
text_,
|
||||||
gspeaker,
|
gspeaker,
|
||||||
|
@ -354,18 +362,18 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('--fullbody_offset_x', type=int, default=0)
|
parser.add_argument('--fullbody_offset_x', type=int, default=0)
|
||||||
parser.add_argument('--fullbody_offset_y', type=int, default=0)
|
parser.add_argument('--fullbody_offset_y', type=int, default=0)
|
||||||
|
|
||||||
parser.add_argument('--tts', type=str, default='edgetts') #xtts
|
parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
|
||||||
parser.add_argument('--ref_file', type=str, default=None)
|
parser.add_argument('--ref_file', type=str, default=None)
|
||||||
parser.add_argument('--xtts_server', type=str, default='http://localhost:9000')
|
parser.add_argument('--tts_server', type=str, default='http://localhost:9000')
|
||||||
|
|
||||||
opt = parser.parse_args()
|
opt = parser.parse_args()
|
||||||
app.config.from_object(opt)
|
app.config.from_object(opt)
|
||||||
#print(app.config['xtts_server'])
|
#print(app.config['tts_server'])
|
||||||
|
|
||||||
tts_type = opt.tts
|
tts_type = opt.tts
|
||||||
if tts_type == "xtts":
|
if tts_type == "xtts":
|
||||||
print("Computing the latents for a new reference...")
|
print("Computing the latents for a new reference...")
|
||||||
gspeaker = get_speaker(opt.ref_file, opt.xtts_server)
|
gspeaker = get_speaker(opt.ref_file, opt.tts_server)
|
||||||
|
|
||||||
# assert test mode
|
# assert test mode
|
||||||
opt.test = True
|
opt.test = True
|
||||||
|
|
|
@ -314,10 +314,13 @@ class ASR:
|
||||||
|
|
||||||
def push_audio(self,buffer): #push audio pcm from tts
|
def push_audio(self,buffer): #push audio pcm from tts
|
||||||
print(f'[INFO] push_audio {len(buffer)}')
|
print(f'[INFO] push_audio {len(buffer)}')
|
||||||
if self.opt.tts == "xtts":
|
if self.opt.tts == "xtts" or self.opt.tts == "gpt-sovits":
|
||||||
if len(buffer)>0:
|
if len(buffer)>0:
|
||||||
stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767
|
stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767
|
||||||
|
if self.opt.tts == "xtts":
|
||||||
stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
|
stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
|
||||||
|
else:
|
||||||
|
stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
|
||||||
#byte_stream=BytesIO(buffer)
|
#byte_stream=BytesIO(buffer)
|
||||||
#stream = self.__create_bytes_stream(byte_stream)
|
#stream = self.__create_bytes_stream(byte_stream)
|
||||||
streamlen = stream.shape[0]
|
streamlen = stream.shape[0]
|
||||||
|
|
|
@ -1,27 +1,51 @@
|
||||||
一、采用gpt-sovits方案,bert-sovits适合长音频训练,gpt-sovits运行短音频快速推理
|
# 采用gpt-sovits方案,bert-sovits适合长音频训练,gpt-sovits运行短音频快速推理
|
||||||
下载tts服务端代码
|
## 部署tts推理
|
||||||
https://github.com/yanyuxiyangzk/GPT-SoVITS/tree/fast_inference_
|
git clone https://github.com/X-T-E-R/GPT-SoVITS-Inference.git
|
||||||
api_v2.py即启动的服务端代码,也可以打开声音克隆界面进行训练,可以训练带感情语气等
|
|
||||||
|
|
||||||
1、启动
|
1. 安装依赖库
|
||||||
python api_v2.py -a 127.0.0.1 -p 9880 -c GPT_SoVITS/configs/tts_infer.yaml
|
```
|
||||||
|
conda create -n GPTSoVits python=3.9
|
||||||
|
conda activate GPTSoVits
|
||||||
|
bash install.sh
|
||||||
|
```
|
||||||
|
从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将它们放置在 `GPT_SoVITS/pretrained_models` 中
|
||||||
|
|
||||||
|
2. Model Folder Format
|
||||||
|
模型文件下载地址 https://www.yuque.com/xter/zibxlp/gsximn7ditzgispg
|
||||||
|
下载的模型文件放到trained目录下, 如 `trained/Character1/`
|
||||||
|
Put the pth / ckpt / wav files in it; the wav file's name should be its prompt text.
|
||||||
|
For example:
|
||||||
|
|
||||||
|
```
|
||||||
|
trained
|
||||||
|
--hutao
|
||||||
|
----hutao-e75.ckpt
|
||||||
|
----hutao_e60_s3360.pth
|
||||||
|
----hutao said something.wav
|
||||||
|
```
|
||||||
|
|
||||||
|
3. 启动
|
||||||
|
后端接口: python Inference/src/tts_backend.py
|
||||||
|
如果有错误提示找不到 cmudict,从这里下载 https://github.com/nltk/nltk_data ,将 packages 目录改名为 nltk_data 并放到 home 目录下
|
||||||
|
管理页面: python Inference/src/TTS_Webui.py, 浏览器打开可以管理character和emotion
|
||||||
|
|
||||||
|
|
||||||
http://127.0.0.1:9880/set_sovits_weights?weights_path=SoVITS_weights/maimai_e55_s1210.pth
|
4. 接口测试
|
||||||
http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_weights/maimai-e21.ckpt
|
Character and Emotion List
|
||||||
|
To obtain the supported characters and their corresponding emotions, please visit the following URL:
|
||||||
|
- URL: `http://127.0.0.1:5000/character_list`
|
||||||
|
- Returns: A JSON format list of characters and corresponding emotions
|
||||||
|
- Method: `GET`
|
||||||
|
|
||||||
|
```
|
||||||
2、接口测试
|
{
|
||||||
http://127.0.0.1:9880/set_ava?ava=maimai
|
"Hanabi": [
|
||||||
|
"default",
|
||||||
http://127.0.0.1:9880/tts_ava?ava=maimai&text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样
|
"Normal",
|
||||||
|
"Yandere",
|
||||||
http://127.0.0.1:9880/tts_ava?ava=maimai&text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样&streaming_mode=true
|
],
|
||||||
|
"Hutao": [
|
||||||
http://127.0.0.1:9880/tts_ava?ava=maimai&text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。&text_lang=zh&ref_audio_path=mengpai.wav&prompt_lang=zh&prompt_text=呜哇好生气啊!不要把我跟一斗相提并论!&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true
|
"default"
|
||||||
|
]
|
||||||
3、使用
|
}
|
||||||
设置角色
|
```
|
||||||
http://127.0.0.1:9880/set_ava?ava=maimai
|
|
||||||
tts接口
|
|
||||||
http://127.0.0.1:9880/tts_ava?ava=maimai&text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样&streaming_mode=true
|
|
Loading…
Reference in New Issue