diff --git a/basereal.py b/basereal.py index 60fb952..67449d3 100644 --- a/basereal.py +++ b/basereal.py @@ -18,7 +18,7 @@ import soundfile as sf import av from fractions import Fraction -from ttsreal import EdgeTTS,VoitsTTS,XTTS +from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS from tqdm import tqdm def read_imgs(img_list): @@ -41,6 +41,8 @@ class BaseReal: self.tts = VoitsTTS(opt,self) elif opt.tts == "xtts": self.tts = XTTS(opt,self) + elif opt.tts == "cosyvoice": + self.tts = CosyVoiceTTS(opt,self) self.recording = False self.recordq_video = Queue() diff --git a/nerfreal.py b/nerfreal.py index 1a4e1d0..10a221a 100644 --- a/nerfreal.py +++ b/nerfreal.py @@ -17,6 +17,8 @@ import asyncio from av import AudioFrame, VideoFrame from basereal import BaseReal +#from imgcache import ImgCache + from tqdm import tqdm def read_imgs(img_list): frames = [] @@ -60,6 +62,7 @@ class NeRFReal(BaseReal): input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0])) #print('input_img_list:',input_img_list) self.fullbody_list_cycle = read_imgs(input_img_list[:frame_total_num]) + #self.imagecache = ImgCache(frame_total_num,self.opt.fullbody_img,1000) #self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32) #self.need_update = True # camera moved, should reset accumulation @@ -225,7 +228,8 @@ class NeRFReal(BaseReal): #print("frame index:",data['index']) #image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg')) image_fullbody = self.fullbody_list_cycle[data['index'][0]] - image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB) + #image_fullbody = self.imagecache.get_img(data['index'][0]) + image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB) start_x = self.opt.fullbody_offset_x # 合并后小图片的起始x坐标 start_y = self.opt.fullbody_offset_y # 合并后小图片的起始y坐标 image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image diff --git a/ttsreal.py b/ttsreal.py 
index f29e2ba..cff9fc3 100644
--- a/ttsreal.py
+++ b/ttsreal.py
@@ -156,7 +156,7 @@ class VoitsTTS(BaseTTS):
             return
 
         first = True
-        for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2
+        for chunk in res.iter_content(chunk_size=16000): # 1280 32K*20ms*2
             if first:
                 end = time.perf_counter()
                 print(f"gpt_sovits Time to first chunk: {end-start}s")
@@ -180,6 +180,95 @@ class VoitsTTS(BaseTTS):
                     streamlen -= self.chunk
                     idx += self.chunk
 
+###########################################################################################
+class CosyVoiceTTS(BaseTTS):
+    """TTS backend that streams zero-shot synthesis from a CosyVoice HTTP server."""
+
+    def txt_to_audio(self,msg):
+        """Synthesize ``msg`` and push the resulting audio frames to the parent."""
+        self.stream_tts(
+            self.cosy_voice(
+                msg,
+                self.opt.REF_FILE,
+                self.opt.REF_TEXT,
+                "zh", #en args.language,
+                self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
+            )
+        )
+
+    def cosy_voice(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
+        """Yield raw audio byte chunks streamed from ``/inference_zero_shot``.
+
+        Args:
+            text: Text to synthesize (sent as ``tts_text``).
+            reffile: Path of the reference wav uploaded as ``prompt_wav``.
+            reftext: Transcript of the reference audio (sent as ``prompt_text``).
+            language: Not used by this method.
+            server_url: Base URL of the CosyVoice server.
+
+        Nothing is yielded when the server answers with a non-200 status.
+        """
+        start = time.perf_counter()
+        payload = {
+            'tts_text': text,
+            'prompt_text': reftext
+        }
+        # Keep the reference wav open only while the request is being sent, so
+        # the file handle is not leaked on every utterance.
+        with open(reffile, 'rb') as prompt_wav:
+            files = [('prompt_wav', ('prompt_wav', prompt_wav, 'application/octet-stream'))]
+            res = requests.request("GET", f"{server_url}/inference_zero_shot", data=payload, files=files, stream=True)
+
+        end = time.perf_counter()
+        print(f"cosy_voice Time to make POST: {end-start}s")
+
+        if res.status_code != 200:
+            print("Error:", res.text)
+            return
+
+        first = True
+        for chunk in res.iter_content(chunk_size=16000): # 16000 bytes = 8000 int16 samples
+            if first:
+                end = time.perf_counter()
+                print(f"cosy_voice Time to first chunk: {end-start}s")
+                first = False
+            # Chunks received while the state is not RUNNING are read but not yielded.
+            if chunk and self.state==State.RUNNING:
+                yield chunk
+
+        print("cosy_voice response.elapsed:", res.elapsed)
+
+    def stream_tts(self,audio_stream):
+        """Resample streamed int16 PCM to ``self.sample_rate`` and emit fixed-size frames.
+
+        The stream is treated as 22050 Hz int16 samples. Bytes and resampled
+        samples that do not fill a whole unit are carried over to the next
+        chunk instead of being dropped, so frames stay contiguous across chunk
+        boundaries. A final tail shorter than ``self.chunk`` samples is discarded.
+        """
+        pending = b''    # raw bytes not decoded yet (odd byte, or too short to resample)
+        leftover = None  # resampled samples that did not fill a whole frame
+        for chunk in audio_stream:
+            if chunk is None or len(chunk) == 0:
+                continue
+            data = pending + chunk
+            usable = len(data) - len(data) % 2  # int16 needs an even byte count
+            # resampy rejects inputs too short to produce one output sample;
+            # hold such data back until more bytes arrive.
+            if (usable // 2) * self.sample_rate < 22050:
+                pending = data
+                continue
+            pending = data[usable:]
+            stream = np.frombuffer(data[:usable], dtype=np.int16).astype(np.float32) / 32767
+            stream = resampy.resample(x=stream, sr_orig=22050, sr_new=self.sample_rate)
+            if leftover is not None:
+                stream = np.concatenate((leftover, stream))
+            idx = 0
+            while stream.shape[0] - idx >= self.chunk:
+                self.parent.put_audio_frame(stream[idx:idx+self.chunk])
+                idx += self.chunk
+            leftover = stream[idx:]
+
 ###########################################################################################
 class XTTS(BaseTTS):
     def __init__(self, opt, parent):