From 2e64be4b5d403977d16610e7e19a212e68ead922 Mon Sep 17 00:00:00 2001 From: lipku Date: Sun, 21 Apr 2024 17:09:08 +0800 Subject: [PATCH] add support gpt-sovits --- app.py | 44 +++++++++++++++++++------------- asrreal.py | 7 ++++-- tts/README.md | 70 ++++++++++++++++++++++++++++++++++----------------- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/app.py b/app.py index e301977..48f7e57 100644 --- a/app.py +++ b/app.py @@ -75,7 +75,7 @@ def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[byt return first = True - for chunk in res.iter_content(chunk_size=960): + for chunk in res.iter_content(chunk_size=960): #24K*20ms*2 if first: end = time.perf_counter() print(f"xtts Time to first chunk: {end-start}s") @@ -85,12 +85,20 @@ def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[byt print("xtts response.elapsed:", res.elapsed) -def gpt_sovits(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]: +def gpt_sovits(text, character, language, server_url, stream_chunk_size) -> Iterator[bytes]: start = time.perf_counter() - speaker["text"] = text - speaker["language"] = language - speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality - res = requests.get(f"{server_url}&text="+text,stream=True) + req={} + req["text"] = text + req["text_language"] = language + req["character"] = character + #req["emotion"] = emotion + #req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality + req["stream"] = True + res = requests.post( + f"{server_url}/tts", + json=req, + stream=True, + ) end = time.perf_counter() print(f"gpt_sovits Time to make POST: {end-start}s") @@ -99,7 +107,7 @@ def gpt_sovits(text, speaker, language, server_url, stream_chunk_size) -> Iterat return first = True - for chunk in res.iter_content(chunk_size=960): + for chunk in res.iter_content(chunk_size=1280): #32K*20ms*2 if first: end = time.perf_counter() print(f"gpt_sovits Time to first chunk: {end-start}s") @@ -109,7 +117,7 @@ def gpt_sovits(text, speaker, language, server_url, stream_chunk_size) -> Iterat print("gpt_sovits response.elapsed:", res.elapsed) -def stream_xtts(audio_stream,render): +def stream_tts(audio_stream,render): for chunk in audio_stream: if chunk is not None: render.push_audio(chunk) @@ -121,19 +129,19 @@ def txt_to_audio(text_): t = time.time() asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal)) print(f'-------edge tts time:{time.time()-t:.4f}s') - elif tts_type == "gpt": #gpt_sovits - stream_xtts( + elif tts_type == "gpt-sovits": #gpt_sovits + stream_tts( gpt_sovits( text_, - gspeaker, - "zh-cn", #en args.language, - "http://127.0.0.1:9880/tts_ava?ava=maimai&streaming_mode=true", #args.server_url, + "test", #character + "zh", #en args.language, + "http://127.0.0.1:5000", #args.server_url, "20" #args.stream_chunk_size ), nerfreal ) else: #xtts - stream_xtts( + stream_tts( xtts( text_, gspeaker, @@ -354,18 +362,18 @@ if __name__ == '__main__': parser.add_argument('--fullbody_offset_x', type=int, default=0) parser.add_argument('--fullbody_offset_y', type=int, default=0) - parser.add_argument('--tts', type=str, default='edgetts') #xtts + parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits parser.add_argument('--ref_file', type=str, default=None) - parser.add_argument('--xtts_server', type=str, default='http://localhost:9000') + parser.add_argument('--tts_server', type=str, 
default='http://localhost:9000')
 
     opt = parser.parse_args()
     app.config.from_object(opt)
-    #print(app.config['xtts_server'])
+    #print(app.config['tts_server'])
     tts_type = opt.tts
     if tts_type == "xtts":
         print("Computing the latents for a new reference...")
-        gspeaker = get_speaker(opt.ref_file, opt.xtts_server)
+        gspeaker = get_speaker(opt.ref_file, opt.tts_server)
 
     # assert test mode
     opt.test = True
diff --git a/asrreal.py b/asrreal.py
index 5d8fea8..e29b3eb 100644
--- a/asrreal.py
+++ b/asrreal.py
@@ -314,10 +314,13 @@ class ASR:
     def push_audio(self,buffer): #push audio pcm from tts
         print(f'[INFO] push_audio {len(buffer)}')
-        if self.opt.tts == "xtts":
+        if self.opt.tts == "xtts" or self.opt.tts == "gpt-sovits":
             if len(buffer)>0:
                 stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767
-                stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
+                if self.opt.tts == "xtts":
+                    stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
+                else:
+                    stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
                 #byte_stream=BytesIO(buffer)
                 #stream = self.__create_bytes_stream(byte_stream)
                 streamlen = stream.shape[0]
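The asrreal.py change above keys the resampling rate off the TTS backend: xtts streams 24 kHz PCM while gpt-sovits streams 32 kHz, both as 16-bit mono. Below is a minimal standalone sketch of that conversion; the helper name and the 16 kHz target rate are illustrative assumptions, not part of this patch.

```python
# Sketch of the PCM handling that push_audio() performs for the streaming TTS
# backends. Uses the same numpy/resampy calls as asrreal.py; the function name
# and the 16000 Hz default are assumptions for illustration only.
import numpy as np
import resampy

def pcm_chunk_to_float(buffer: bytes, tts_type: str, asr_sample_rate: int = 16000) -> np.ndarray:
    # xtts delivers 24 kHz PCM, gpt-sovits delivers 32 kHz PCM
    sr_orig = 24000 if tts_type == "xtts" else 32000
    # int16 bytes -> float32 samples in [-1, 1]
    stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767
    # resample to the rate the ASR front end expects
    return resampy.resample(x=stream, sr_orig=sr_orig, sr_new=asr_sample_rate)

# Chunk-size arithmetic used in app.py: 20 ms of 16-bit mono audio is
# 24000 * 0.02 * 2 = 960 bytes for xtts and 32000 * 0.02 * 2 = 1280 bytes
# for gpt-sovits, hence the chunk_size values passed to iter_content().
```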
diff --git a/tts/README.md b/tts/README.md
index f2f10b4..b3cba8d 100644
--- a/tts/README.md
+++ b/tts/README.md
@@ -1,27 +1,51 @@
-一、采用gpt-sovits方案,bert-sovits适合长音频训练,gpt-sovits运行短音频快速推理
-下载tts服务端代码
-https://github.com/yanyuxiyangzk/GPT-SoVITS/tree/fast_inference_
-api_v2.py即启动的服务端代码,也可以打开声音克隆界面进行训练,可以训练带感情语气等
+# GPT-SoVITS based TTS: bert-sovits is suited to training on long audio, while gpt-sovits offers fast inference on short audio
+## Deploy the TTS inference service
+git clone https://github.com/X-T-E-R/GPT-SoVITS-Inference.git
 
-1、启动
-python api_v2.py -a 127.0.0.1 -p 9880 -c GPT_SoVITS/configs/tts_infer.yaml
+1. Install the dependencies
+```
+conda create -n GPTSoVits python=3.9
+conda activate GPTSoVits
+bash install.sh
+```
+Download the pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS\pretrained_models`.
+
+2. Model Folder Format
+Model files can be downloaded from https://www.yuque.com/xter/zibxlp/gsximn7ditzgispg
+Put the downloaded model files under the trained directory, e.g. `trained/Character1/`.
+Each character folder holds the pth / ckpt / wav files; the wav should be named after its prompt text.
+For example:
+
+```
+trained
+--hutao
+----hutao-e75.ckpt
+----hutao_e60_s3360.pth
+----hutao said something.wav
+```
+
+3. Launch the services
+Backend API: python Inference/src/tts_backend.py
+If you get an error that cmudict cannot be found, download https://github.com/nltk/nltk_data, rename the packages folder to nltk_data and put it in your home directory.
+Management WebUI: python Inference/src/TTS_Webui.py; open it in a browser to manage characters and emotions.
 
-http://127.0.0.1:9880/set_sovits_weights?weights_path=SoVITS_weights/maimai_e55_s1210.pth
-http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_weights/maimai-e21.ckpt
+4. Test the API
+Character and Emotion List
+To obtain the supported characters and their corresponding emotions, visit the following URL:
+- URL: `http://127.0.0.1:5000/character_list`
+- Returns: a JSON-formatted list of characters and their corresponding emotions
+- Method: `GET`
 
-
-2、接口测试
-http://127.0.0.1:9880/set_ava?ava=maimai
-
-http://127.0.0.1:9880/tts_ava?ava=maimai&text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样
-
-http://127.0.0.1:9880/tts_ava?ava=maimai&text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样&streaming_mode=true
-
-http://127.0.0.1:9880/tts_ava?ava=maimai&text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。&text_lang=zh&ref_audio_path=mengpai.wav&prompt_lang=zh&prompt_text=呜哇好生气啊!不要把我跟一斗相提并论!&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true
-
-3、使用
-设置角色
-http://127.0.0.1:9880/set_ava?ava=maimai
-tts接口
-http://127.0.0.1:9880/tts_ava?ava=maimai&text=我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样&streaming_mode=true
\ No newline at end of file
+```
+{
+    "Hanabi": [
+        "default",
+        "Normal",
+        "Yandere"
+    ],
+    "Hutao": [
+        "default"
+    ]
+}
+```
\ No newline at end of file
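For reference, here is a minimal client sketch for the two endpoints documented in step 4, mirroring the request body that the new gpt_sovits() helper in app.py builds (text, text_language, character, stream). The character name "test", the sample sentence and the output file are placeholders, and the response is consumed as a 16-bit PCM stream in the same way app.py feeds chunks into push_audio(); treat this as an illustrative sketch, not the backend's official documentation.

```python
# Exercise the GPT-SoVITS-Inference backend started in step 3.
# Field names follow gpt_sovits() in app.py; "test" and out.pcm are placeholders.
import requests

SERVER = "http://127.0.0.1:5000"

# 1) List the available characters and their emotions (step 4 above).
print(requests.get(f"{SERVER}/character_list").json())

# 2) Stream audio for one sentence. app.py reads the response in 1280-byte
#    chunks, i.e. 20 ms of 16-bit mono audio at 32 kHz, and pushes each chunk
#    to the renderer via push_audio().
req = {
    "text": "我是一个粉刷匠,粉刷本领强。",
    "text_language": "zh",
    "character": "test",
    "stream": True,
}
with requests.post(f"{SERVER}/tts", json=req, stream=True) as res:
    res.raise_for_status()
    with open("out.pcm", "wb") as f:
        for chunk in res.iter_content(chunk_size=1280):
            f.write(chunk)
```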