diff --git a/basereal.py b/basereal.py
index 60fb952..67449d3 100644
--- a/basereal.py
+++ b/basereal.py
@@ -18,7 +18,7 @@ import soundfile as sf
 import av
 from fractions import Fraction
 
-from ttsreal import EdgeTTS,VoitsTTS,XTTS
+from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS
 
 from tqdm import tqdm
 def read_imgs(img_list):
@@ -41,6 +41,8 @@ class BaseReal:
             self.tts = VoitsTTS(opt,self)
         elif opt.tts == "xtts":
             self.tts = XTTS(opt,self)
+        elif opt.tts == "cosyvoice":
+            self.tts = CosyVoiceTTS(opt,self)
         
         self.recording = False
         self.recordq_video = Queue()
diff --git a/nerfreal.py b/nerfreal.py
index 1a4e1d0..10a221a 100644
--- a/nerfreal.py
+++ b/nerfreal.py
@@ -17,6 +17,8 @@ import asyncio
 from av import AudioFrame, VideoFrame
 from basereal import BaseReal
 
+#from imgcache import ImgCache
+
 from tqdm import tqdm
 def read_imgs(img_list):
     frames = []
@@ -60,6 +62,7 @@ class NeRFReal(BaseReal):
             input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
             #print('input_img_list:',input_img_list)
             self.fullbody_list_cycle = read_imgs(input_img_list[:frame_total_num])
+            #self.imagecache = ImgCache(frame_total_num,self.opt.fullbody_img,1000)
 
         #self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32)
         #self.need_update = True # camera moved, should reset accumulation
@@ -225,7 +228,8 @@ class NeRFReal(BaseReal):
                 #print("frame index:",data['index'])
                 #image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg'))
                 image_fullbody = self.fullbody_list_cycle[data['index'][0]]
-                image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB)
+                #image_fullbody = self.imagecache.get_img(data['index'][0])
+                image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB)                
                 start_x = self.opt.fullbody_offset_x  # 合并后小图片的起始x坐标
                 start_y = self.opt.fullbody_offset_y  # 合并后小图片的起始y坐标
                 image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image
diff --git a/ttsreal.py b/ttsreal.py
index f29e2ba..cff9fc3 100644
--- a/ttsreal.py
+++ b/ttsreal.py
@@ -156,7 +156,7 @@ class VoitsTTS(BaseTTS):
             return
             
         first = True
-        for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2
+        for chunk in res.iter_content(chunk_size=16000): # 1280 32K*20ms*2
             if first:
                 end = time.perf_counter()
                 print(f"gpt_sovits Time to first chunk: {end-start}s")
@@ -180,6 +180,60 @@ class VoitsTTS(BaseTTS):
                     streamlen -= self.chunk
                     idx += self.chunk 
 
+###########################################################################################
+class CosyVoiceTTS(BaseTTS):
+    """TTS backend that streams zero-shot speech from a CosyVoice HTTP server."""
+
+    def txt_to_audio(self,msg):
+        """Synthesize ``msg`` with the reference voice from ``self.opt`` and queue the audio."""
+        self.stream_tts(
+            self.cosy_voice(
+                msg,
+                self.opt.REF_FILE,
+                self.opt.REF_TEXT,
+                "zh", # language argument; cosy_voice() accepts it but does not send it
+                self.opt.TTS_SERVER, # e.g. "http://127.0.0.1:5000"
+            )
+        )
+
+    def cosy_voice(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
+        """Yield audio chunks from ``{server_url}/inference_zero_shot``; nothing on a non-200 reply.
+
+        ``reffile`` is the prompt wav path, ``reftext`` its transcript; ``language`` is unused."""
+        start = time.perf_counter()
+        payload = {'tts_text': text, 'prompt_text': reftext}
+        # requests reads the upload while building the request, so close the file right after.
+        with open(reffile, 'rb') as prompt_wav:
+            files = [('prompt_wav', ('prompt_wav', prompt_wav, 'application/octet-stream'))]
+            res = requests.request("GET", f"{server_url}/inference_zero_shot", data=payload, files=files, stream=True)
+        end = time.perf_counter()
+        print(f"cosy_voice Time to make POST: {end-start}s")
+        if res.status_code != 200:
+            print("Error:", res.text)
+            return
+        first = True
+        for chunk in res.iter_content(chunk_size=16000): # bytes requested per read
+            if first:
+                end = time.perf_counter()
+                print(f"cosy_voice Time to first chunk: {end-start}s")
+                first = False
+            if chunk and self.state==State.RUNNING: # still drains the reply when not running
+                yield chunk
+        print("cosy_voice response.elapsed:", res.elapsed)
+
+    def stream_tts(self,audio_stream):
+        """Resample 22050 Hz int16 PCM chunks and push them as ``self.chunk``-sample frames."""
+        pending = np.zeros(0, dtype=np.float32) # remainder carried over so no samples are lost
+        for chunk in audio_stream:
+            if chunk is not None and len(chunk)>0:
+                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                stream = resampy.resample(x=stream, sr_orig=22050, sr_new=self.sample_rate)
+                stream = np.concatenate((pending, stream))
+                usable = stream.shape[0] - stream.shape[0] % self.chunk
+                for idx in range(0, usable, self.chunk):
+                    self.parent.put_audio_frame(stream[idx:idx+self.chunk])
+                pending = stream[usable:]
+
 ###########################################################################################
 class XTTS(BaseTTS):
     def __init__(self, opt, parent):