Refactor TTS code

Parent: 4e355e9ab9
Commit: 632409da1e

README.md (15 lines changed)
@@ -1,5 +1,5 @@
-A streaming digital human based on the Ernerf model, realizing synchronized audio-video dialogue. It can basically achieve commercial-grade results.
-A streaming digital human based on the ernerf model, achieving synchronized audio-video dialogue; basically usable commercially.
+A real-time interactive streaming digital human, realizing synchronized audio-video dialogue. It can basically achieve commercial-grade results.
+A real-time interactive streaming digital human, achieving synchronized audio-video dialogue; basically usable commercially.
 
 [ernerf demo](https://www.bilibili.com/video/BV1PM4m1y7Q2/) [musetalk demo](https://www.bilibili.com/video/BV1gm421N7vQ/)
 

@@ -23,17 +23,17 @@ conda create -n nerfstream python=3.10
 conda activate nerfstream
 conda install pytorch==1.12.1 torchvision==0.13.1 cudatoolkit=11.3 -c pytorch
 pip install -r requirements.txt
+# If you only use the musetalk model, the libraries below are not needed
 pip install "git+https://github.com/facebookresearch/pytorch3d.git"
 pip install tensorflow-gpu==2.8.0
 pip install --upgrade "protobuf<=3.20.1"
-pip install --upgrade "edge-tts<=6.1.11"
 ```
 Common installation problems: [FAQ](/assets/faq.md)
 For setting up a Linux CUDA environment, see this article: https://zhuanlan.zhihu.com/p/674972886
 
 
 ## 2. Quick Start
-By default, webrtc is used to push the stream to srs.
+By default the ernerf model is used, with webrtc pushing the stream to srs.
 ### 2.1 Run the rtmp server (srs)
 ```
 export CANDIDATE='<server public IP>'

@@ -211,8 +211,9 @@ the docker version is no longer the latest code; it can be used as an empty environment into which the latest
 - [x] MuseTalk
 - [ ] SyncTalk
 
 If this project helps you, please give it a star. Anyone interested is also welcome to join in improving it.
-Email: lipku@foxmail.com
 知识星球 (Knowledge Planet): https://t.zsxq.com/7NMyO
 WeChat official account: 数字人技术
 
+Buy me a coffee
+
app.py (169 lines changed)

@@ -23,132 +23,11 @@ import argparse
 
 import shutil
 import asyncio
-import edge_tts
-
-from typing import Iterator
-
-import requests
-
 app = Flask(__name__)
 sockets = Sockets(app)
 global nerfreal
-global tts_type
-global gspeaker
-
-
-async def main(voicename: str, text: str, render):
-    communicate = edge_tts.Communicate(text, voicename)
-
-    #with open(OUTPUT_FILE, "wb") as file:
-    first = True
-    async for chunk in communicate.stream():
-        if first:
-            #render.before_push_audio()
-            first = False
-        if chunk["type"] == "audio":
-            render.push_audio(chunk["data"])
-            #file.write(chunk["data"])
-        elif chunk["type"] == "WordBoundary":
-            pass
-
-def get_speaker(ref_audio,server_url):
-    files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))}
-    response = requests.post(f"{server_url}/clone_speaker", files=files)
-    return response.json()
-
-def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
-    start = time.perf_counter()
-    speaker["text"] = text
-    speaker["language"] = language
-    speaker["stream_chunk_size"] = stream_chunk_size  # you can reduce it to get faster response, but degrade quality
-    res = requests.post(
-        f"{server_url}/tts_stream",
-        json=speaker,
-        stream=True,
-    )
-    end = time.perf_counter()
-    print(f"xtts Time to make POST: {end-start}s")
-
-    if res.status_code != 200:
-        print("Error:", res.text)
-        return
-
-    first = True
-    for chunk in res.iter_content(chunk_size=960): #24K*20ms*2
-        if first:
-            end = time.perf_counter()
-            print(f"xtts Time to first chunk: {end-start}s")
-            first = False
-        if chunk:
-            yield chunk
-
-    print("xtts response.elapsed:", res.elapsed)
-
-def gpt_sovits(text, character, language, server_url, emotion) -> Iterator[bytes]:
-    start = time.perf_counter()
-    req={}
-    req["text"] = text
-    req["text_language"] = language
-    req["character"] = character
-    req["emotion"] = emotion
-    #req["stream_chunk_size"] = stream_chunk_size  # you can reduce it to get faster response, but degrade quality
-    req["stream"] = True
-    res = requests.post(
-        f"{server_url}/tts",
-        json=req,
-        stream=True,
-    )
-    end = time.perf_counter()
-    print(f"gpt_sovits Time to make POST: {end-start}s")
-
-    if res.status_code != 200:
-        print("Error:", res.text)
-        return
-
-    first = True
-    for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2
-        if first:
-            end = time.perf_counter()
-            print(f"gpt_sovits Time to first chunk: {end-start}s")
-            first = False
-        if chunk:
-            yield chunk
-
-    print("gpt_sovits response.elapsed:", res.elapsed)
-
-def stream_tts(audio_stream,render):
-    for chunk in audio_stream:
-        if chunk is not None:
-            render.push_audio(chunk)
-
-def txt_to_audio(text_):
-    if tts_type == "edgetts":
-        voicename = "zh-CN-YunxiaNeural"
-        text = text_
-        t = time.time()
-        asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
-        print(f'-------edge tts time:{time.time()-t:.4f}s')
-    elif tts_type == "gpt-sovits": #gpt_sovits
-        stream_tts(
-            gpt_sovits(
-                text_,
-                app.config['CHARACTER'], #"test", #character
-                "zh", #en args.language,
-                app.config['TTS_SERVER'], #"http://127.0.0.1:5000", #args.server_url,
-                app.config['EMOTION'], #emotion
-            ),
-            nerfreal
-        )
-    else: #xtts
-        stream_tts(
-            xtts(
-                text_,
-                gspeaker,
-                "zh-cn", #en args.language,
-                app.config['TTS_SERVER'], #"http://localhost:9000", #args.server_url,
-                "20" #args.stream_chunk_size
-            ),
-            nerfreal
-        )
-
 
 @sockets.route('/humanecho')

@@ -168,7 +47,7 @@ def echo_socket(ws):
         if not message or len(message)==0:
             return '输入信息为空'
         else:
-            txt_to_audio(message)
+            nerfreal.put_msg_txt(message)
 
 
 def llm_response(message):

@@ -198,42 +77,11 @@ def chat_socket(ws):
             return '输入信息为空'
         else:
             res=llm_response(message)
-            txt_to_audio(res)
+            nerfreal.put_msg_txt(res)
 
 #####webrtc###############################
 pcs = set()
 
-async def txt_to_audio_async(text_):
-    if tts_type == "edgetts":
-        voicename = "zh-CN-YunxiaNeural"
-        text = text_
-        t = time.time()
-        #asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
-        await main(voicename,text,nerfreal)
-        print(f'-------edge tts time:{time.time()-t:.4f}s')
-    elif tts_type == "gpt-sovits": #gpt_sovits
-        stream_tts(
-            gpt_sovits(
-                text_,
-                app.config['CHARACTER'], #"test", #character
-                "zh", #en args.language,
-                app.config['TTS_SERVER'], #"http://127.0.0.1:5000", #args.server_url,
-                app.config['EMOTION'], #emotion
-            ),
-            nerfreal
-        )
-    else: #xtts
-        stream_tts(
-            xtts(
-                text_,
-                gspeaker,
-                "zh-cn", #en args.language,
-                app.config['TTS_SERVER'], #"http://localhost:9000", #args.server_url,
-                "20" #args.stream_chunk_size
-            ),
-            nerfreal
-        )
-
 #@app.route('/offer', methods=['POST'])
 async def offer(request):
     params = await request.json()

@@ -271,10 +119,10 @@ async def human(request):
     params = await request.json()
 
     if params['type']=='echo':
-        await txt_to_audio_async(params['text'])
+        nerfreal.put_msg_txt(params['text'])
     elif params['type']=='chat':
-        res=llm_response(params['text'])
-        await txt_to_audio_async(res)
+        res=await asyncio.get_event_loop().run_in_executor(None, llm_response(params['text']))
+        nerfreal.put_msg_txt(res)
 
     return web.Response(
         content_type="application/json",

@@ -453,14 +301,9 @@ if __name__ == '__main__':
     parser.add_argument('--listenport', type=int, default=8010)
 
     opt = parser.parse_args()
-    app.config.from_object(opt)
+    #app.config.from_object(opt)
     #print(app.config)
 
-    tts_type = opt.tts
-    if tts_type == "xtts":
-        print("Computing the latents for a new reference...")
-        gspeaker = get_speaker(opt.REF_FILE, opt.TTS_SERVER)
-
    if opt.model == 'ernerf':
        from ernerf.nerf_triplane.provider import NeRFDataset_Test
        from ernerf.nerf_triplane.utils import *
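One caveat in the new `/human` chat branch: `run_in_executor` expects a callable plus its arguments, but the committed line calls `llm_response(params['text'])` eagerly, so the blocking LLM call still runs on the event loop and its return value is passed where a callable belongs. A minimal sketch of the non-blocking form, using `llm_response` and `nerfreal` from this file:

```python
# Sketch (not part of the commit): non-blocking form of the new chat branch.
# run_in_executor takes the callable and its arguments separately, so the
# blocking llm_response call runs in a worker thread, not on the event loop.
import asyncio

async def handle_chat(text):
    loop = asyncio.get_event_loop()
    res = await loop.run_in_executor(None, llm_response, text)  # llm_response from app.py
    nerfreal.put_msg_txt(res)  # nerfreal is the module-level global above
```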
asrreal.py (54 lines changed)

@@ -56,7 +56,6 @@ class ASR:
 
         # create input stream
         self.queue = Queue()
-        self.input_stream = BytesIO()
         self.output_queue = Queue()
         # start a background process to read frames
         #self.input_stream = self.audio_instance.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, output=False, frames_per_buffer=self.chunk)

@@ -204,6 +203,9 @@ class ASR:
         # np.save(output_path, unfold_feats.cpu().numpy())
         # print(f"[INFO] saved logits to {output_path}")
 
+    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
+        self.queue.put(audio_chunk)
+
     def __get_audio_frame(self):
         if self.inwarm: # warm up
             return np.zeros(self.chunk, dtype=np.float32),1

@@ -260,56 +262,6 @@ class ASR:
 
         return logits[0], None,None #predicted_ids[0], transcription # [N,]
 
-    def __create_bytes_stream(self,byte_stream):
-        #byte_stream=BytesIO(buffer)
-        stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
-        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
-        stream = stream.astype(np.float32)
-
-        if stream.ndim > 1:
-            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
-            stream = stream[:, 0]
-
-        if sample_rate != self.sample_rate and stream.shape[0]>0:
-            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
-            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
-
-        return stream
-
-    def push_audio(self,buffer): #push audio pcm from tts
-        print(f'[INFO] push_audio {len(buffer)}')
-        if self.opt.tts == "xtts" or self.opt.tts == "gpt-sovits":
-            if len(buffer)>0:
-                stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767
-                if self.opt.tts == "xtts":
-                    stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
-                else:
-                    stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
-                #byte_stream=BytesIO(buffer)
-                #stream = self.__create_bytes_stream(byte_stream)
-                streamlen = stream.shape[0]
-                idx=0
-                while streamlen >= self.chunk:
-                    self.queue.put(stream[idx:idx+self.chunk])
-                    streamlen -= self.chunk
-                    idx += self.chunk
-                # if streamlen>0: #skip last frame(not 20ms)
-                #     self.queue.put(stream[idx:])
-        else: #edge tts
-            self.input_stream.write(buffer)
-            if len(buffer)<=0:
-                self.input_stream.seek(0)
-                stream = self.__create_bytes_stream(self.input_stream)
-                streamlen = stream.shape[0]
-                idx=0
-                while streamlen >= self.chunk:
-                    self.queue.put(stream[idx:idx+self.chunk])
-                    streamlen -= self.chunk
-                    idx += self.chunk
-                #if streamlen>0: #skip last frame(not 20ms)
-                #    self.queue.put(stream[idx:])
-                self.input_stream.seek(0)
-                self.input_stream.truncate()
-
     def get_audio_out(self): #get origin audio pcm to nerf
         return self.output_queue.get()
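With `push_audio` and `__create_bytes_stream` removed, `ASR` no longer knows anything about TTS output formats; it only accepts fixed 20 ms float32 frames through `put_audio_frame`. A small sketch of that framing contract, assuming the constructor's `sample_rate = 16000` and a frame rate of 50 fps (so `chunk = 320` samples, matching the comments in this commit); `feed_pcm` is a hypothetical helper, not part of the commit:

```python
import numpy as np

SAMPLE_RATE = 16000
FPS = 50                      # one frame every 20 ms
CHUNK = SAMPLE_RATE // FPS    # 320 samples per frame

def feed_pcm(asr, stream):
    """Slice a float32 mono stream into 20 ms frames for ASR.put_audio_frame.
    Mirrors the while-loop in the TTS classes; the trailing partial frame
    (shorter than 20 ms) is dropped, as in the original code."""
    idx = 0
    while stream.shape[0] - idx >= CHUNK:
        asr.put_audio_frame(stream[idx:idx + CHUNK])
        idx += CHUNK
```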
museasr.py (58 lines changed)

@@ -18,7 +18,7 @@ class MuseASR:
         self.sample_rate = 16000
         self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000)
         self.queue = Queue()
-        self.input_stream = BytesIO()
+        # self.input_stream = BytesIO()
         self.output_queue = Queue()
 
         self.audio_processor = audio_processor

@@ -29,62 +29,14 @@ class MuseASR:
 
         self.warm_up()
 
-    def __create_bytes_stream(self,byte_stream):
-        #byte_stream=BytesIO(buffer)
-        stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
-        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
-        stream = stream.astype(np.float32)
-
-        if stream.ndim > 1:
-            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
-            stream = stream[:, 0]
-
-        if sample_rate != self.sample_rate and stream.shape[0]>0:
-            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
-            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
-
-        return stream
-
-    def push_audio(self,buffer):
-        print(f'[INFO] push_audio {len(buffer)}')
-        if self.opt.tts == "xtts" or self.opt.tts == "gpt-sovits":
-            if len(buffer)>0:
-                stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767
-                if self.opt.tts == "xtts":
-                    stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
-                else:
-                    stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
-                #byte_stream=BytesIO(buffer)
-                #stream = self.__create_bytes_stream(byte_stream)
-                streamlen = stream.shape[0]
-                idx=0
-                while streamlen >= self.chunk:
-                    self.queue.put(stream[idx:idx+self.chunk])
-                    streamlen -= self.chunk
-                    idx += self.chunk
-                # if streamlen>0: #skip last frame(not 20ms)
-                #     self.queue.put(stream[idx:])
-        else: #edge tts
-            self.input_stream.write(buffer)
-            if len(buffer)<=0:
-                self.input_stream.seek(0)
-                stream = self.__create_bytes_stream(self.input_stream)
-                streamlen = stream.shape[0]
-                idx=0
-                while streamlen >= self.chunk:
-                    self.queue.put(stream[idx:idx+self.chunk])
-                    streamlen -= self.chunk
-                    idx += self.chunk
-                #if streamlen>0: #skip last frame(not 20ms)
-                #    self.queue.put(stream[idx:])
-                self.input_stream.seek(0)
-                self.input_stream.truncate()
+    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
+        self.queue.put(audio_chunk)
 
     def __get_audio_frame(self):
         try:
-            frame = self.queue.get(block=False)
+            frame = self.queue.get(block=True,timeout=0.02)
             type = 0
-            print(f'[INFO] get frame {frame.shape}')
+            #print(f'[INFO] get frame {frame.shape}')
         except queue.Empty:
             frame = np.zeros(self.chunk, dtype=np.float32)
             type = 1
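The `__get_audio_frame` change swaps the non-blocking `queue.get(block=False)` for `get(block=True, timeout=0.02)`: the reader now waits up to one 20 ms frame period for real audio before substituting silence, which smooths over small TTS jitter instead of immediately emitting a silent frame. The pattern, isolated as a sketch:

```python
import queue
import numpy as np

def next_frame(q, chunk):
    """Wait up to one 20 ms frame period for real audio, else emit silence.
    Returns (frame, type): type 0 = real audio, type 1 = silence, matching
    MuseASR.__get_audio_frame."""
    try:
        return q.get(block=True, timeout=0.02), 0
    except queue.Empty:
        return np.zeros(chunk, dtype=np.float32), 1
```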
musereal.py (55 lines changed)

@@ -21,6 +21,7 @@ from musetalk.utils.utils import get_file_type,get_video_fps,datagen
 from musetalk.utils.preprocessing import get_landmark_and_bbox,read_imgs,coord_placeholder
 from musetalk.utils.blending import get_image,get_image_prepare_material,get_image_blending
 from musetalk.utils.utils import load_all_model
+from ttsreal import EdgeTTS,VoitsTTS,XTTS
 
 from museasr import MuseASR
 import asyncio

@@ -59,6 +60,13 @@ class MuseReal:
         self.__loadavatar()
 
         self.asr = MuseASR(opt,self.audio_processor)
+        if opt.tts == "edgetts":
+            self.tts = EdgeTTS(opt,self)
+        elif opt.tts == "gpt-sovits":
+            self.tts = VoitsTTS(opt,self)
+        elif opt.tts == "xtts":
+            self.tts = XTTS(opt,self)
+        #self.__warm_up()
 
     def __loadmodels(self):
         # load model weights

@@ -83,8 +91,11 @@ class MuseReal:
         self.mask_list_cycle = read_imgs(input_mask_list)
 
 
-    def push_audio(self,buffer):
-        self.asr.push_audio(buffer)
+    def put_msg_txt(self,msg):
+        self.tts.put_msg_txt(msg)
 
+    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
+        self.asr.put_audio_frame(audio_chunk)
+
     def __mirror_index(self, index):
         size = len(self.coord_list_cycle)

@@ -93,13 +104,37 @@ class MuseReal:
         if turn % 2 == 0:
             return res
         else:
             return size - res - 1
 
+    def __warm_up(self):
+        self.asr.run_step()
+        whisper_chunks = self.asr.get_next_feat()
+        whisper_batch = np.stack(whisper_chunks)
+        latent_batch = []
+        for i in range(self.batch_size):
+            idx = self.__mirror_index(self.idx+i)
+            latent = self.input_latent_list_cycle[idx]
+            latent_batch.append(latent)
+        latent_batch = torch.cat(latent_batch, dim=0)
+        print('infer=======')
+        # for i, (whisper_batch,latent_batch) in enumerate(gen):
+        audio_feature_batch = torch.from_numpy(whisper_batch)
+        audio_feature_batch = audio_feature_batch.to(device=self.unet.device,
+                                                     dtype=self.unet.model.dtype)
+        audio_feature_batch = self.pe(audio_feature_batch)
+        latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
+
+        pred_latents = self.unet.model(latent_batch,
+                                       self.timesteps,
+                                       encoder_hidden_states=audio_feature_batch).sample
+        recon = self.vae.decode_latents(pred_latents)
+
     def test_step(self,loop=None,audio_track=None,video_track=None):
 
         # gen = datagen(whisper_chunks,
         #               self.input_latent_list_cycle,
         #               self.batch_size)
+        starttime=time.perf_counter()
         self.asr.run_step()
         whisper_chunks = self.asr.get_next_feat()
         is_all_silence=True

@@ -114,7 +149,8 @@ class MuseReal:
                 self.res_frame_queue.put((None,self.__mirror_index(self.idx),audio_frames[i*2:i*2+2]))
                 self.idx = self.idx + 1
         else:
-            print('infer=======')
+            # print('infer=======')
+            t=time.perf_counter()
             whisper_batch = np.stack(whisper_chunks)
             latent_batch = []
             for i in range(self.batch_size):

@@ -129,16 +165,22 @@ class MuseReal:
                                                          dtype=self.unet.model.dtype)
             audio_feature_batch = self.pe(audio_feature_batch)
             latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
+            # print('prepare time:',time.perf_counter()-t)
+            # t=time.perf_counter()
 
             pred_latents = self.unet.model(latent_batch,
                                            self.timesteps,
                                            encoder_hidden_states=audio_feature_batch).sample
+            # print('unet time:',time.perf_counter()-t)
+            # t=time.perf_counter()
             recon = self.vae.decode_latents(pred_latents)
+            # print('vae time:',time.perf_counter()-t)
             #print('diffusion len=',len(recon))
             for i,res_frame in enumerate(recon):
                 #self.__pushmedia(res_frame,loop,audio_track,video_track)
                 self.res_frame_queue.put((res_frame,self.__mirror_index(self.idx),audio_frames[i*2:i*2+2]))
                 self.idx = self.idx + 1
+            print('total batch time:',time.perf_counter()-starttime)
 
 
     def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):

@@ -175,12 +217,14 @@ class MuseReal:
             new_frame.sample_rate=16000
             # if audio_track._queue.qsize()>10:
             #     time.sleep(0.1)
             asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
+        print('musereal process_frames thread stop')
 
     def render(self,quit_event,loop=None,audio_track=None,video_track=None):
         #if self.opt.asr:
         #     self.asr.warm_up()
 
+        self.tts.render(quit_event)
         process_thread = Thread(target=self.process_frames, args=(quit_event,loop,audio_track,video_track))
         process_thread.start()

@@ -207,4 +251,5 @@ class MuseReal:
             # delay = _starttime+_totalframe*0.04-time.perf_counter() #40ms
             # if delay > 0:
             #     time.sleep(delay)
+        print('musereal thread stop')
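Taken together, the MuseReal changes split the old `push_audio` entry point in two: `put_msg_txt` hands text to the TTS worker, and `put_audio_frame` hands 20 ms PCM frames to the ASR queue. A rough sketch of the resulting pipeline; the `MuseReal(opt)` construction and the `loop`/track arguments are assumptions based on the surrounding code:

```python
# text -> put_msg_txt -> BaseTTS.msgqueue -> process_tts thread
#      -> txt_to_audio -> parent.put_audio_frame -> MuseASR.queue
#      -> __get_audio_frame -> whisper features -> unet/vae -> video frames
from threading import Event

real = MuseReal(opt)              # opt.tts selects EdgeTTS / VoitsTTS / XTTS
quit_event = Event()
real.render(quit_event, loop, audio_track, video_track)  # starts tts + frame threads
real.put_msg_txt("你好")          # enqueue text; synthesis happens off-thread
```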
nerfreal.py (18 lines changed)

@@ -10,6 +10,8 @@ import torch.nn.functional as F
 import cv2
 
 from asrreal import ASR
+from ttsreal import EdgeTTS,VoitsTTS,XTTS
+
 import asyncio
 from av import AudioFrame, VideoFrame

@@ -63,6 +65,12 @@ class NeRFReal:
         if self.opt.asr:
             self.asr = ASR(opt)
             self.asr.warm_up()
+        if opt.tts == "edgetts":
+            self.tts = EdgeTTS(opt,self)
+        elif opt.tts == "gpt-sovits":
+            self.tts = VoitsTTS(opt,self)
+        elif opt.tts == "xtts":
+            self.tts = XTTS(opt,self)
 
         '''
         video_path = 'video_stream'

@@ -110,8 +118,11 @@ class NeRFReal:
         if self.opt.asr:
             self.asr.stop()
 
-    def push_audio(self,chunk):
-        self.asr.push_audio(chunk)
+    def put_msg_txt(self,msg):
+        self.tts.put_msg_txt(msg)
 
+    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
+        self.asr.put_audio_frame(audio_chunk)
+
     def mirror_index(self, index):

@@ -231,6 +242,8 @@ class NeRFReal:
         totaltime=0
         _starttime=time.perf_counter()
         _totalframe=0
+
+        self.tts.render(quit_event)
         while not quit_event.is_set(): #todo
             # update texture every frame
             # audio stream thread...

@@ -255,5 +268,6 @@ class NeRFReal:
             if video_track._queue.qsize()>=5:
                 #print('sleep qsize=',video_track._queue.qsize())
                 time.sleep(0.1)
+        print('nerfreal thread stop')
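NeRFReal.__init__ now duplicates the same `opt.tts` dispatch as MuseReal.__init__. A hypothetical factory (not in this commit) that would centralize the choice:

```python
# Hypothetical helper (not in this commit): centralize the opt.tts dispatch
# duplicated in NeRFReal.__init__ and MuseReal.__init__.
from ttsreal import EdgeTTS, VoitsTTS, XTTS

def make_tts(opt, parent):
    ttscls = {"edgetts": EdgeTTS, "gpt-sovits": VoitsTTS, "xtts": XTTS}.get(opt.tts)
    if ttscls is None:
        raise ValueError(f"unknown tts type: {opt.tts}")
    return ttscls(opt, parent)
```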
ttsreal.py (new file, 223 lines)

@@ -0,0 +1,223 @@
+import time
+import numpy as np
+import soundfile as sf
+import resampy
+import asyncio
+import edge_tts
+
+from typing import Iterator
+
+import requests
+
+import queue
+from queue import Queue
+from io import BytesIO
+from threading import Thread, Event
+
+class BaseTTS:
+    def __init__(self, opt, parent):
+        self.opt=opt
+        self.parent = parent
+
+        self.fps = opt.fps # 20 ms per frame
+        self.sample_rate = 16000
+        self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000)
+        self.input_stream = BytesIO()
+
+        self.msgqueue = Queue()
+
+    def put_msg_txt(self,msg):
+        self.msgqueue.put(msg)
+
+    def render(self,quit_event):
+        process_thread = Thread(target=self.process_tts, args=(quit_event,))
+        process_thread.start()
+
+    def process_tts(self,quit_event):
+        while not quit_event.is_set():
+            try:
+                msg = self.msgqueue.get(block=True, timeout=1)
+            except queue.Empty:
+                continue
+            self.txt_to_audio(msg)
+        print('ttsreal thread stop')
+
+    def txt_to_audio(self,msg):
+        pass
+
+###########################################################################################
+class EdgeTTS(BaseTTS):
+    def txt_to_audio(self,msg):
+        voicename = "zh-CN-YunxiaNeural"
+        text = msg
+        t = time.time()
+        asyncio.new_event_loop().run_until_complete(self.__main(voicename,text))
+        print(f'-------edge tts time:{time.time()-t:.4f}s')
+
+        self.input_stream.seek(0)
+        stream = self.__create_bytes_stream(self.input_stream)
+        streamlen = stream.shape[0]
+        idx=0
+        while streamlen >= self.chunk:
+            self.parent.put_audio_frame(stream[idx:idx+self.chunk])
+            streamlen -= self.chunk
+            idx += self.chunk
+        #if streamlen>0: #skip last frame(not 20ms)
+        #    self.queue.put(stream[idx:])
+        self.input_stream.seek(0)
+        self.input_stream.truncate()
+
+    def __create_bytes_stream(self,byte_stream):
+        #byte_stream=BytesIO(buffer)
+        stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
+        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
+        stream = stream.astype(np.float32)
+
+        if stream.ndim > 1:
+            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
+            stream = stream[:, 0]
+
+        if sample_rate != self.sample_rate and stream.shape[0]>0:
+            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
+            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
+
+        return stream
+
+    async def __main(self,voicename: str, text: str):
+        communicate = edge_tts.Communicate(text, voicename)
+
+        #with open(OUTPUT_FILE, "wb") as file:
+        first = True
+        async for chunk in communicate.stream():
+            if first:
+                first = False
+            if chunk["type"] == "audio":
+                #self.push_audio(chunk["data"])
+                self.input_stream.write(chunk["data"])
+                #file.write(chunk["data"])
+            elif chunk["type"] == "WordBoundary":
+                pass
+
+###########################################################################################
+class VoitsTTS(BaseTTS):
+    def txt_to_audio(self,msg):
+        self.stream_tts(
+            self.gpt_sovits(
+                msg,
+                self.opt.CHARACTER, #"test", #character
+                "zh", #en args.language,
+                self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
+                self.opt.EMOTION, #emotion
+            )
+        )
+
+    def gpt_sovits(self, text, character, language, server_url, emotion) -> Iterator[bytes]:
+        start = time.perf_counter()
+        req={}
+        req["text"] = text
+        req["text_language"] = language
+        req["character"] = character
+        req["emotion"] = emotion
+        #req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
+        req["stream"] = True
+        res = requests.post(
+            f"{server_url}/tts",
+            json=req,
+            stream=True,
+        )
+        end = time.perf_counter()
+        print(f"gpt_sovits Time to make POST: {end-start}s")
+
+        if res.status_code != 200:
+            print("Error:", res.text)
+            return
+
+        first = True
+        for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2
+            if first:
+                end = time.perf_counter()
+                print(f"gpt_sovits Time to first chunk: {end-start}s")
+                first = False
+            if chunk:
+                yield chunk
+
+        print("gpt_sovits response.elapsed:", res.elapsed)
+
+    def stream_tts(self,audio_stream):
+        for chunk in audio_stream:
+            if chunk is not None and len(chunk)>0:
+                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
+                #byte_stream=BytesIO(buffer)
+                #stream = self.__create_bytes_stream(byte_stream)
+                streamlen = stream.shape[0]
+                idx=0
+                while streamlen >= self.chunk:
+                    self.parent.put_audio_frame(stream[idx:idx+self.chunk])
+                    streamlen -= self.chunk
+                    idx += self.chunk
+
+###########################################################################################
+class XTTS(BaseTTS):
+    def __init__(self, opt, parent):
+        super().__init__(opt,parent)
+        self.speaker = self.get_speaker(opt.REF_FILE, opt.TTS_SERVER)
+
+    def txt_to_audio(self,msg):
+        self.stream_tts(
+            self.xtts(
+                msg,
+                self.speaker,
+                "zh-cn", #en args.language,
+                self.opt.TTS_SERVER, #"http://localhost:9000", #args.server_url,
+                "20" #args.stream_chunk_size
+            )
+        )
+
+    def get_speaker(self,ref_audio,server_url):
+        files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))}
+        response = requests.post(f"{server_url}/clone_speaker", files=files)
+        return response.json()
+
+    def xtts(self,text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
+        start = time.perf_counter()
+        speaker["text"] = text
+        speaker["language"] = language
+        speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
+        res = requests.post(
+            f"{server_url}/tts_stream",
+            json=speaker,
+            stream=True,
+        )
+        end = time.perf_counter()
+        print(f"xtts Time to make POST: {end-start}s")
+
+        if res.status_code != 200:
+            print("Error:", res.text)
+            return
+
+        first = True
+        for chunk in res.iter_content(chunk_size=960): #24K*20ms*2
+            if first:
+                end = time.perf_counter()
+                print(f"xtts Time to first chunk: {end-start}s")
+                first = False
+            if chunk:
+                yield chunk
+
+        print("xtts response.elapsed:", res.elapsed)
+
+    def stream_tts(self,audio_stream):
+        for chunk in audio_stream:
+            if chunk is not None and len(chunk)>0:
+                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
+                #byte_stream=BytesIO(buffer)
+                #stream = self.__create_bytes_stream(byte_stream)
+                streamlen = stream.shape[0]
+                idx=0
+                while streamlen >= self.chunk:
+                    self.parent.put_audio_frame(stream[idx:idx+self.chunk])
+                    streamlen -= self.chunk
+                    idx += self.chunk
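The new ttsreal.py can be exercised on its own: any object with a `put_audio_frame` method can act as the parent. A minimal sketch, where the stub parent and the `SimpleNamespace` opt are assumptions for illustration (the real parents are `NeRFReal`/`MuseReal`):

```python
# Minimal sketch of driving ttsreal.py standalone. StubParent and the
# SimpleNamespace opt are assumptions for illustration; the real parents
# are NeRFReal and MuseReal.
import time
from threading import Event
from types import SimpleNamespace

from ttsreal import EdgeTTS

class StubParent:
    def put_audio_frame(self, frame):   # receives 320-sample float32 chunks
        print("got frame:", frame.shape)

opt = SimpleNamespace(fps=50)           # BaseTTS: chunk = 16000 // fps = 320
quit_event = Event()
tts = EdgeTTS(opt, StubParent())
tts.render(quit_event)                  # starts the process_tts worker thread
tts.put_msg_txt("你好")                 # text is synthesized asynchronously
time.sleep(5)                           # give edge-tts time to stream audio
quit_event.set()                        # worker exits after its 1 s queue timeout
```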