From 632409da1eefdf9cae700dd918e15b292a2e9570 Mon Sep 17 00:00:00 2001 From: lipku Date: Sun, 2 Jun 2024 22:25:19 +0800 Subject: [PATCH] Refactoring tts code --- README.md | 15 ++-- app.py | 169 ++------------------------------------- asrreal.py | 54 +------------ museasr.py | 58 ++------------ musereal.py | 55 +++++++++++-- nerfreal.py | 18 ++++- ttsreal.py | 223 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 311 insertions(+), 281 deletions(-) create mode 100644 ttsreal.py diff --git a/README.md b/README.md index 65a3d0a..2b5b559 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -A streaming digital human based on the Ernerf model, realize audio video synchronous dialogue. It can basically achieve commercial effects. -基于ernerf模型的流式数字人,实现音视频同步对话。基本可以达到商用效果 +Real time interactive streaming digital human, realize audio video synchronous dialogue. It can basically achieve commercial effects. +实时交互流式数字人,实现音视频同步对话。基本可以达到商用效果 [ernerf效果](https://www.bilibili.com/video/BV1PM4m1y7Q2/) [musetalk效果](https://www.bilibili.com/video/BV1gm421N7vQ/) @@ -23,17 +23,17 @@ conda create -n nerfstream python=3.10 conda activate nerfstream conda install pytorch==1.12.1 torchvision==0.13.1 cudatoolkit=11.3 -c pytorch pip install -r requirements.txt +#如果只用musetalk模型,不需要安装下面的库 pip install "git+https://github.com/facebookresearch/pytorch3d.git" pip install tensorflow-gpu==2.8.0 pip install --upgrade "protobuf<=3.20.1" -pip install --upgrade "edge-tts<=6.1.11" ``` 安装常见问题[FAQ](/assets/faq.md) linux cuda环境搭建可以参考这篇文章 https://zhuanlan.zhihu.com/p/674972886 ## 2. Quick Start -默认采用webrtc推流到srs +默认采用ernerf模型,webrtc推流到srs ### 2.1 运行rtmpserver (srs) ``` export CANDIDATE='<服务器外网ip>' @@ -211,8 +211,9 @@ docker版本已经不是最新代码,可以作为一个空环境,把最新 - [x] MuseTalk - [ ] SyncTalk -如果本项目对你有帮助,帮忙点个star。也欢迎感兴趣的朋友一起来完善该项目。 -Email: lipku@foxmail.com +如果本项目对你有帮助,帮忙点个star。也欢迎感兴趣的朋友一起来完善该项目。 知识星球: https://t.zsxq.com/7NMyO 微信公众号:数字人技术 -![](https://mmbiz.qpic.cn/sz_mmbiz_jpg/l3ZibgueFiaeyfaiaLZGuMGQXnhLWxibpJUS2gfs8Dje6JuMY8zu2tVyU9n8Zx1yaNncvKHBMibX0ocehoITy5qQEZg/640?wxfrom=12&tp=wxpic&usePicPrefetch=1&wx_fmt=jpeg&from=appmsg) +![](https://mmbiz.qpic.cn/sz_mmbiz_jpg/l3ZibgueFiaeyfaiaLZGuMGQXnhLWxibpJUS2gfs8Dje6JuMY8zu2tVyU9n8Zx1yaNncvKHBMibX0ocehoITy5qQEZg/640?wxfrom=12&tp=wxpic&usePicPrefetch=1&wx_fmt=jpeg&from=appmsg) +Buy me a coffee +![](https://mmbiz.qpic.cn/sz_mmbiz_jpg/l3ZibgueFiaeyEO2TDmroXibUSeFRCB3ftThHyTgVmVYyVVyvqDxronGvoU7xzkztnwQpnM5lBgx4MSaUUrnRZwCw/640?wx_fmt=jpeg&from=appmsg) diff --git a/app.py b/app.py index c1a099b..a552435 100644 --- a/app.py +++ b/app.py @@ -23,132 +23,11 @@ import argparse import shutil import asyncio -import edge_tts -from typing import Iterator -import requests app = Flask(__name__) sockets = Sockets(app) global nerfreal -global tts_type -global gspeaker - - -async def main(voicename: str, text: str, render): - communicate = edge_tts.Communicate(text, voicename) - - #with open(OUTPUT_FILE, "wb") as file: - first = True - async for chunk in communicate.stream(): - if first: - #render.before_push_audio() - first = False - if chunk["type"] == "audio": - render.push_audio(chunk["data"]) - #file.write(chunk["data"]) - elif chunk["type"] == "WordBoundary": - pass - -def get_speaker(ref_audio,server_url): - files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))} - response = requests.post(f"{server_url}/clone_speaker", files=files) - return response.json() - -def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]: - start = time.perf_counter() - 
speaker["text"] = text - speaker["language"] = language - speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality - res = requests.post( - f"{server_url}/tts_stream", - json=speaker, - stream=True, - ) - end = time.perf_counter() - print(f"xtts Time to make POST: {end-start}s") - - if res.status_code != 200: - print("Error:", res.text) - return - - first = True - for chunk in res.iter_content(chunk_size=960): #24K*20ms*2 - if first: - end = time.perf_counter() - print(f"xtts Time to first chunk: {end-start}s") - first = False - if chunk: - yield chunk - - print("xtts response.elapsed:", res.elapsed) - -def gpt_sovits(text, character, language, server_url, emotion) -> Iterator[bytes]: - start = time.perf_counter() - req={} - req["text"] = text - req["text_language"] = language - req["character"] = character - req["emotion"] = emotion - #req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality - req["stream"] = True - res = requests.post( - f"{server_url}/tts", - json=req, - stream=True, - ) - end = time.perf_counter() - print(f"gpt_sovits Time to make POST: {end-start}s") - - if res.status_code != 200: - print("Error:", res.text) - return - - first = True - for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2 - if first: - end = time.perf_counter() - print(f"gpt_sovits Time to first chunk: {end-start}s") - first = False - if chunk: - yield chunk - - print("gpt_sovits response.elapsed:", res.elapsed) - -def stream_tts(audio_stream,render): - for chunk in audio_stream: - if chunk is not None: - render.push_audio(chunk) - -def txt_to_audio(text_): - if tts_type == "edgetts": - voicename = "zh-CN-YunxiaNeural" - text = text_ - t = time.time() - asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal)) - print(f'-------edge tts time:{time.time()-t:.4f}s') - elif tts_type == "gpt-sovits": #gpt_sovits - stream_tts( - gpt_sovits( - text_, - app.config['CHARACTER'], #"test", #character - "zh", #en args.language, - app.config['TTS_SERVER'], #"http://127.0.0.1:5000", #args.server_url, - app.config['EMOTION'], #emotion - ), - nerfreal - ) - else: #xtts - stream_tts( - xtts( - text_, - gspeaker, - "zh-cn", #en args.language, - app.config['TTS_SERVER'], #"http://localhost:9000", #args.server_url, - "20" #args.stream_chunk_size - ), - nerfreal - ) @sockets.route('/humanecho') @@ -168,7 +47,7 @@ def echo_socket(ws): if not message or len(message)==0: return '输入信息为空' else: - txt_to_audio(message) + nerfreal.put_msg_txt(message) def llm_response(message): @@ -198,42 +77,11 @@ def chat_socket(ws): return '输入信息为空' else: res=llm_response(message) - txt_to_audio(res) + nerfreal.put_msg_txt(res) #####webrtc############################### pcs = set() -async def txt_to_audio_async(text_): - if tts_type == "edgetts": - voicename = "zh-CN-YunxiaNeural" - text = text_ - t = time.time() - #asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal)) - await main(voicename,text,nerfreal) - print(f'-------edge tts time:{time.time()-t:.4f}s') - elif tts_type == "gpt-sovits": #gpt_sovits - stream_tts( - gpt_sovits( - text_, - app.config['CHARACTER'], #"test", #character - "zh", #en args.language, - app.config['TTS_SERVER'], #"http://127.0.0.1:5000", #args.server_url, - app.config['EMOTION'], #emotion - ), - nerfreal - ) - else: #xtts - stream_tts( - xtts( - text_, - gspeaker, - "zh-cn", #en args.language, - app.config['TTS_SERVER'], #"http://localhost:9000", 
#args.server_url, - "20" #args.stream_chunk_size - ), - nerfreal - ) - #@app.route('/offer', methods=['POST']) async def offer(request): params = await request.json() @@ -271,10 +119,10 @@ async def human(request): params = await request.json() if params['type']=='echo': - await txt_to_audio_async(params['text']) + nerfreal.put_msg_txt(params['text']) elif params['type']=='chat': - res=llm_response(params['text']) - await txt_to_audio_async(res) + res=await asyncio.get_event_loop().run_in_executor(None, llm_response(params['text'])) + nerfreal.put_msg_txt(res) return web.Response( content_type="application/json", @@ -453,14 +301,9 @@ if __name__ == '__main__': parser.add_argument('--listenport', type=int, default=8010) opt = parser.parse_args() - app.config.from_object(opt) + #app.config.from_object(opt) #print(app.config) - tts_type = opt.tts - if tts_type == "xtts": - print("Computing the latents for a new reference...") - gspeaker = get_speaker(opt.REF_FILE, opt.TTS_SERVER) - if opt.model == 'ernerf': from ernerf.nerf_triplane.provider import NeRFDataset_Test from ernerf.nerf_triplane.utils import * diff --git a/asrreal.py b/asrreal.py index f7beb38..b3e4093 100644 --- a/asrreal.py +++ b/asrreal.py @@ -56,7 +56,6 @@ class ASR: # create input stream self.queue = Queue() - self.input_stream = BytesIO() self.output_queue = Queue() # start a background process to read frames #self.input_stream = self.audio_instance.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, output=False, frames_per_buffer=self.chunk) @@ -204,6 +203,9 @@ class ASR: # np.save(output_path, unfold_feats.cpu().numpy()) # print(f"[INFO] saved logits to {output_path}") + def put_audio_frame(self,audio_chunk): #16khz 20ms pcm + self.queue.put(audio_chunk) + def __get_audio_frame(self): if self.inwarm: # warm up return np.zeros(self.chunk, dtype=np.float32),1 @@ -260,56 +262,6 @@ class ASR: return logits[0], None,None #predicted_ids[0], transcription # [N,] - def __create_bytes_stream(self,byte_stream): - #byte_stream=BytesIO(buffer) - stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64 - print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}') - stream = stream.astype(np.float32) - - if stream.ndim > 1: - print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.') - stream = stream[:, 0] - - if sample_rate != self.sample_rate and stream.shape[0]>0: - print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.') - stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate) - - return stream - - def push_audio(self,buffer): #push audio pcm from tts - print(f'[INFO] push_audio {len(buffer)}') - if self.opt.tts == "xtts" or self.opt.tts == "gpt-sovits": - if len(buffer)>0: - stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767 - if self.opt.tts == "xtts": - stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate) - else: - stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate) - #byte_stream=BytesIO(buffer) - #stream = self.__create_bytes_stream(byte_stream) - streamlen = stream.shape[0] - idx=0 - while streamlen >= self.chunk: - self.queue.put(stream[idx:idx+self.chunk]) - streamlen -= self.chunk - idx += self.chunk - # if streamlen>0: #skip last frame(not 20ms) - # self.queue.put(stream[idx:]) - else: #edge tts - self.input_stream.write(buffer) - if len(buffer)<=0: - self.input_stream.seek(0) - stream = self.__create_bytes_stream(self.input_stream) - 
streamlen = stream.shape[0] - idx=0 - while streamlen >= self.chunk: - self.queue.put(stream[idx:idx+self.chunk]) - streamlen -= self.chunk - idx += self.chunk - #if streamlen>0: #skip last frame(not 20ms) - # self.queue.put(stream[idx:]) - self.input_stream.seek(0) - self.input_stream.truncate() def get_audio_out(self): #get origin audio pcm to nerf return self.output_queue.get() diff --git a/museasr.py b/museasr.py index 6d65147..c9a3a82 100644 --- a/museasr.py +++ b/museasr.py @@ -18,7 +18,7 @@ class MuseASR: self.sample_rate = 16000 self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000) self.queue = Queue() - self.input_stream = BytesIO() + # self.input_stream = BytesIO() self.output_queue = Queue() self.audio_processor = audio_processor @@ -29,62 +29,14 @@ class MuseASR: self.warm_up() - def __create_bytes_stream(self,byte_stream): - #byte_stream=BytesIO(buffer) - stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64 - print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}') - stream = stream.astype(np.float32) - - if stream.ndim > 1: - print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.') - stream = stream[:, 0] - - if sample_rate != self.sample_rate and stream.shape[0]>0: - print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.') - stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate) - - return stream - - def push_audio(self,buffer): - print(f'[INFO] push_audio {len(buffer)}') - if self.opt.tts == "xtts" or self.opt.tts == "gpt-sovits": - if len(buffer)>0: - stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767 - if self.opt.tts == "xtts": - stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate) - else: - stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate) - #byte_stream=BytesIO(buffer) - #stream = self.__create_bytes_stream(byte_stream) - streamlen = stream.shape[0] - idx=0 - while streamlen >= self.chunk: - self.queue.put(stream[idx:idx+self.chunk]) - streamlen -= self.chunk - idx += self.chunk - # if streamlen>0: #skip last frame(not 20ms) - # self.queue.put(stream[idx:]) - else: #edge tts - self.input_stream.write(buffer) - if len(buffer)<=0: - self.input_stream.seek(0) - stream = self.__create_bytes_stream(self.input_stream) - streamlen = stream.shape[0] - idx=0 - while streamlen >= self.chunk: - self.queue.put(stream[idx:idx+self.chunk]) - streamlen -= self.chunk - idx += self.chunk - #if streamlen>0: #skip last frame(not 20ms) - # self.queue.put(stream[idx:]) - self.input_stream.seek(0) - self.input_stream.truncate() + def put_audio_frame(self,audio_chunk): #16khz 20ms pcm + self.queue.put(audio_chunk) def __get_audio_frame(self): try: - frame = self.queue.get(block=False) + frame = self.queue.get(block=True,timeout=0.02) type = 0 - print(f'[INFO] get frame {frame.shape}') + #print(f'[INFO] get frame {frame.shape}') except queue.Empty: frame = np.zeros(self.chunk, dtype=np.float32) type = 1 diff --git a/musereal.py b/musereal.py index 385b47d..68cf072 100644 --- a/musereal.py +++ b/musereal.py @@ -21,6 +21,7 @@ from musetalk.utils.utils import get_file_type,get_video_fps,datagen from musetalk.utils.preprocessing import get_landmark_and_bbox,read_imgs,coord_placeholder from musetalk.utils.blending import get_image,get_image_prepare_material,get_image_blending from musetalk.utils.utils import load_all_model +from ttsreal import EdgeTTS,VoitsTTS,XTTS from museasr import MuseASR 
import asyncio @@ -59,6 +60,13 @@ class MuseReal: self.__loadavatar() self.asr = MuseASR(opt,self.audio_processor) + if opt.tts == "edgetts": + self.tts = EdgeTTS(opt,self) + elif opt.tts == "gpt-sovits": + self.tts = VoitsTTS(opt,self) + elif opt.tts == "xtts": + self.tts = XTTS(opt,self) + #self.__warm_up() def __loadmodels(self): # load model weights @@ -83,8 +91,11 @@ class MuseReal: self.mask_list_cycle = read_imgs(input_mask_list) - def push_audio(self,buffer): - self.asr.push_audio(buffer) + def put_msg_txt(self,msg): + self.tts.put_msg_txt(msg) + + def put_audio_frame(self,audio_chunk): #16khz 20ms pcm + self.asr.put_audio_frame(audio_chunk) def __mirror_index(self, index): size = len(self.coord_list_cycle) @@ -93,13 +104,37 @@ class MuseReal: if turn % 2 == 0: return res else: - return size - res - 1 + return size - res - 1 + + def __warm_up(self): + self.asr.run_step() + whisper_chunks = self.asr.get_next_feat() + whisper_batch = np.stack(whisper_chunks) + latent_batch = [] + for i in range(self.batch_size): + idx = self.__mirror_index(self.idx+i) + latent = self.input_latent_list_cycle[idx] + latent_batch.append(latent) + latent_batch = torch.cat(latent_batch, dim=0) + print('infer=======') + # for i, (whisper_batch,latent_batch) in enumerate(gen): + audio_feature_batch = torch.from_numpy(whisper_batch) + audio_feature_batch = audio_feature_batch.to(device=self.unet.device, + dtype=self.unet.model.dtype) + audio_feature_batch = self.pe(audio_feature_batch) + latent_batch = latent_batch.to(dtype=self.unet.model.dtype) + + pred_latents = self.unet.model(latent_batch, + self.timesteps, + encoder_hidden_states=audio_feature_batch).sample + recon = self.vae.decode_latents(pred_latents) def test_step(self,loop=None,audio_track=None,video_track=None): # gen = datagen(whisper_chunks, # self.input_latent_list_cycle, # self.batch_size) + starttime=time.perf_counter() self.asr.run_step() whisper_chunks = self.asr.get_next_feat() is_all_silence=True @@ -114,7 +149,8 @@ class MuseReal: self.res_frame_queue.put((None,self.__mirror_index(self.idx),audio_frames[i*2:i*2+2])) self.idx = self.idx + 1 else: - print('infer=======') + # print('infer=======') + t=time.perf_counter() whisper_batch = np.stack(whisper_chunks) latent_batch = [] for i in range(self.batch_size): @@ -129,16 +165,22 @@ class MuseReal: dtype=self.unet.model.dtype) audio_feature_batch = self.pe(audio_feature_batch) latent_batch = latent_batch.to(dtype=self.unet.model.dtype) + # print('prepare time:',time.perf_counter()-t) + # t=time.perf_counter() pred_latents = self.unet.model(latent_batch, self.timesteps, encoder_hidden_states=audio_feature_batch).sample + # print('unet time:',time.perf_counter()-t) + # t=time.perf_counter() recon = self.vae.decode_latents(pred_latents) + # print('vae time:',time.perf_counter()-t) #print('diffusion len=',len(recon)) for i,res_frame in enumerate(recon): #self.__pushmedia(res_frame,loop,audio_track,video_track) self.res_frame_queue.put((res_frame,self.__mirror_index(self.idx),audio_frames[i*2:i*2+2])) self.idx = self.idx + 1 + print('total batch time:',time.perf_counter()-starttime) def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None): @@ -175,12 +217,14 @@ class MuseReal: new_frame.sample_rate=16000 # if audio_track._queue.qsize()>10: # time.sleep(0.1) - asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) + asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) + print('musereal process_frames thread stop') def 
render(self,quit_event,loop=None,audio_track=None,video_track=None): #if self.opt.asr: # self.asr.warm_up() + self.tts.render(quit_event) process_thread = Thread(target=self.process_frames, args=(quit_event,loop,audio_track,video_track)) process_thread.start() @@ -207,4 +251,5 @@ class MuseReal: # delay = _starttime+_totalframe*0.04-time.perf_counter() #40ms # if delay > 0: # time.sleep(delay) + print('musereal thread stop') \ No newline at end of file diff --git a/nerfreal.py b/nerfreal.py index 3e78e85..5ee2365 100644 --- a/nerfreal.py +++ b/nerfreal.py @@ -10,6 +10,8 @@ import torch.nn.functional as F import cv2 from asrreal import ASR +from ttsreal import EdgeTTS,VoitsTTS,XTTS + import asyncio from av import AudioFrame, VideoFrame @@ -63,6 +65,12 @@ class NeRFReal: if self.opt.asr: self.asr = ASR(opt) self.asr.warm_up() + if opt.tts == "edgetts": + self.tts = EdgeTTS(opt,self) + elif opt.tts == "gpt-sovits": + self.tts = VoitsTTS(opt,self) + elif opt.tts == "xtts": + self.tts = XTTS(opt,self) ''' video_path = 'video_stream' @@ -110,8 +118,11 @@ class NeRFReal: if self.opt.asr: self.asr.stop() - def push_audio(self,chunk): - self.asr.push_audio(chunk) + def put_msg_txt(self,msg): + self.tts.put_msg_txt(msg) + + def put_audio_frame(self,audio_chunk): #16khz 20ms pcm + self.asr.put_audio_frame(audio_chunk) def mirror_index(self, index): @@ -231,6 +242,8 @@ class NeRFReal: totaltime=0 _starttime=time.perf_counter() _totalframe=0 + + self.tts.render(quit_event) while not quit_event.is_set(): #todo # update texture every frame # audio stream thread... @@ -255,5 +268,6 @@ class NeRFReal: if video_track._queue.qsize()>=5: #print('sleep qsize=',video_track._queue.qsize()) time.sleep(0.1) + print('nerfreal thread stop') \ No newline at end of file diff --git a/ttsreal.py b/ttsreal.py new file mode 100644 index 0000000..e09acad --- /dev/null +++ b/ttsreal.py @@ -0,0 +1,223 @@ +import time +import numpy as np +import soundfile as sf +import resampy +import asyncio +import edge_tts + +from typing import Iterator + +import requests + +import queue +from queue import Queue +from io import BytesIO +from threading import Thread, Event + +class BaseTTS: + def __init__(self, opt, parent): + self.opt=opt + self.parent = parent + + self.fps = opt.fps # 20 ms per frame + self.sample_rate = 16000 + self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000) + self.input_stream = BytesIO() + + self.msgqueue = Queue() + + def put_msg_txt(self,msg): + self.msgqueue.put(msg) + + def render(self,quit_event): + process_thread = Thread(target=self.process_tts, args=(quit_event,)) + process_thread.start() + + def process_tts(self,quit_event): + while not quit_event.is_set(): + try: + msg = self.msgqueue.get(block=True, timeout=1) + except queue.Empty: + continue + self.txt_to_audio(msg) + print('ttsreal thread stop') + + def txt_to_audio(self,msg): + pass + + +########################################################################################### +class EdgeTTS(BaseTTS): + def txt_to_audio(self,msg): + voicename = "zh-CN-YunxiaNeural" + text = msg + t = time.time() + asyncio.new_event_loop().run_until_complete(self.__main(voicename,text)) + print(f'-------edge tts time:{time.time()-t:.4f}s') + + self.input_stream.seek(0) + stream = self.__create_bytes_stream(self.input_stream) + streamlen = stream.shape[0] + idx=0 + while streamlen >= self.chunk: + self.parent.put_audio_frame(stream[idx:idx+self.chunk]) + streamlen -= self.chunk + idx += self.chunk + #if streamlen>0: #skip last 
frame(not 20ms) + # self.queue.put(stream[idx:]) + self.input_stream.seek(0) + self.input_stream.truncate() + + def __create_bytes_stream(self,byte_stream): + #byte_stream=BytesIO(buffer) + stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64 + print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}') + stream = stream.astype(np.float32) + + if stream.ndim > 1: + print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.') + stream = stream[:, 0] + + if sample_rate != self.sample_rate and stream.shape[0]>0: + print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.') + stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate) + + return stream + + async def __main(self,voicename: str, text: str): + communicate = edge_tts.Communicate(text, voicename) + + #with open(OUTPUT_FILE, "wb") as file: + first = True + async for chunk in communicate.stream(): + if first: + first = False + if chunk["type"] == "audio": + #self.push_audio(chunk["data"]) + self.input_stream.write(chunk["data"]) + #file.write(chunk["data"]) + elif chunk["type"] == "WordBoundary": + pass + +########################################################################################### +class VoitsTTS(BaseTTS): + def txt_to_audio(self,msg): + self.stream_tts( + self.gpt_sovits( + msg, + self.opt.CHARACTER, #"test", #character + "zh", #en args.language, + self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url, + self.opt.EMOTION, #emotion + ) + ) + + def gpt_sovits(text, character, language, server_url, emotion) -> Iterator[bytes]: + start = time.perf_counter() + req={} + req["text"] = text + req["text_language"] = language + req["character"] = character + req["emotion"] = emotion + #req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality + req["stream"] = True + res = requests.post( + f"{server_url}/tts", + json=req, + stream=True, + ) + end = time.perf_counter() + print(f"gpt_sovits Time to make POST: {end-start}s") + + if res.status_code != 200: + print("Error:", res.text) + return + + first = True + for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2 + if first: + end = time.perf_counter() + print(f"gpt_sovits Time to first chunk: {end-start}s") + first = False + if chunk: + yield chunk + + print("gpt_sovits response.elapsed:", res.elapsed) + + def stream_tts(self,audio_stream): + for chunk in audio_stream: + if chunk is not None and len(chunk)>0: + stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767 + stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate) + #byte_stream=BytesIO(buffer) + #stream = self.__create_bytes_stream(byte_stream) + streamlen = stream.shape[0] + idx=0 + while streamlen >= self.chunk: + self.parent.put_audio_frame(stream[idx:idx+self.chunk]) + streamlen -= self.chunk + idx += self.chunk + +########################################################################################### +class XTTS(BaseTTS): + def __init__(self, opt, parent): + super().__init__(opt,parent) + self.speaker = self.get_speaker(opt.REF_FILE, opt.TTS_SERVER) + + def txt_to_audio(self,msg): + self.stream_tts( + self.xtts( + msg, + self.speaker, + "zh-cn", #en args.language, + self.opt.TTS_SERVER, #"http://localhost:9000", #args.server_url, + "20" #args.stream_chunk_size + ) + ) + + def get_speaker(self,ref_audio,server_url): + files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))} + response = 
requests.post(f"{server_url}/clone_speaker", files=files) + return response.json() + + def xtts(self,text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]: + start = time.perf_counter() + speaker["text"] = text + speaker["language"] = language + speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality + res = requests.post( + f"{server_url}/tts_stream", + json=speaker, + stream=True, + ) + end = time.perf_counter() + print(f"xtts Time to make POST: {end-start}s") + + if res.status_code != 200: + print("Error:", res.text) + return + + first = True + for chunk in res.iter_content(chunk_size=960): #24K*20ms*2 + if first: + end = time.perf_counter() + print(f"xtts Time to first chunk: {end-start}s") + first = False + if chunk: + yield chunk + + print("xtts response.elapsed:", res.elapsed) + + def stream_tts(self,audio_stream): + for chunk in audio_stream: + if chunk is not None and len(chunk)>0: + stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767 + stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate) + #byte_stream=BytesIO(buffer) + #stream = self.__create_bytes_stream(byte_stream) + streamlen = stream.shape[0] + idx=0 + while streamlen >= self.chunk: + self.parent.put_audio_frame(stream[idx:idx+self.chunk]) + streamlen -= self.chunk + idx += self.chunk \ No newline at end of file