add tts cosyvoice

This commit is contained in:
lipku 2024-09-08 12:13:33 +08:00
parent 275af1ed9e
commit f584cb25d1
3 changed files with 63 additions and 3 deletions

View File

@ -18,7 +18,7 @@ import soundfile as sf
import av import av
from fractions import Fraction from fractions import Fraction
from ttsreal import EdgeTTS,VoitsTTS,XTTS from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS
from tqdm import tqdm from tqdm import tqdm
def read_imgs(img_list): def read_imgs(img_list):
@ -41,6 +41,8 @@ class BaseReal:
self.tts = VoitsTTS(opt,self) self.tts = VoitsTTS(opt,self)
elif opt.tts == "xtts": elif opt.tts == "xtts":
self.tts = XTTS(opt,self) self.tts = XTTS(opt,self)
elif opt.tts == "cosyvoice":
self.tts = CosyVoiceTTS(opt,self)
self.recording = False self.recording = False
self.recordq_video = Queue() self.recordq_video = Queue()

View File

@ -17,6 +17,8 @@ import asyncio
from av import AudioFrame, VideoFrame from av import AudioFrame, VideoFrame
from basereal import BaseReal from basereal import BaseReal
#from imgcache import ImgCache
from tqdm import tqdm from tqdm import tqdm
def read_imgs(img_list): def read_imgs(img_list):
frames = [] frames = []
@ -60,6 +62,7 @@ class NeRFReal(BaseReal):
input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0])) input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
#print('input_img_list:',input_img_list) #print('input_img_list:',input_img_list)
self.fullbody_list_cycle = read_imgs(input_img_list[:frame_total_num]) self.fullbody_list_cycle = read_imgs(input_img_list[:frame_total_num])
#self.imagecache = ImgCache(frame_total_num,self.opt.fullbody_img,1000)
#self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32) #self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32)
#self.need_update = True # camera moved, should reset accumulation #self.need_update = True # camera moved, should reset accumulation
@ -225,6 +228,7 @@ class NeRFReal(BaseReal):
#print("frame index:",data['index']) #print("frame index:",data['index'])
#image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg')) #image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg'))
image_fullbody = self.fullbody_list_cycle[data['index'][0]] image_fullbody = self.fullbody_list_cycle[data['index'][0]]
#image_fullbody = self.imagecache.get_img(data['index'][0])
image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB) image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB)
start_x = self.opt.fullbody_offset_x # 合并后小图片的起始x坐标 start_x = self.opt.fullbody_offset_x # 合并后小图片的起始x坐标
start_y = self.opt.fullbody_offset_y # 合并后小图片的起始y坐标 start_y = self.opt.fullbody_offset_y # 合并后小图片的起始y坐标

View File

@ -156,7 +156,7 @@ class VoitsTTS(BaseTTS):
return return
first = True first = True
for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2 for chunk in res.iter_content(chunk_size=16000): # 1280 32K*20ms*2
if first: if first:
end = time.perf_counter() end = time.perf_counter()
print(f"gpt_sovits Time to first chunk: {end-start}s") print(f"gpt_sovits Time to first chunk: {end-start}s")
@ -180,6 +180,60 @@ class VoitsTTS(BaseTTS):
streamlen -= self.chunk streamlen -= self.chunk
idx += self.chunk idx += self.chunk
###########################################################################################
class CosyVoiceTTS(BaseTTS):
    """TTS backend that streams zero-shot synthesis from a CosyVoice HTTP server.

    The reference (prompt) audio and its transcript come from ``opt.REF_FILE``
    and ``opt.REF_TEXT``; the server base URL comes from ``opt.TTS_SERVER``.
    ``self.opt``, ``self.parent``, ``self.state``, ``self.chunk`` and
    ``self.sample_rate`` are presumably set up by ``BaseTTS`` -- confirm there.
    """

    def txt_to_audio(self, msg):
        """Synthesize ``msg`` and forward the audio to the parent in frames."""
        self.stream_tts(
            self.cosy_voice(
                msg,
                self.opt.REF_FILE,
                self.opt.REF_TEXT,
                "zh",  # language is fixed here; cosy_voice() currently ignores it
                self.opt.TTS_SERVER,  # base URL, e.g. "http://127.0.0.1:5000"
            )
        )

    def cosy_voice(self, text, reffile, reftext, language, server_url) -> Iterator[bytes]:
        """Request zero-shot synthesis and yield the raw response body in chunks.

        Args:
            text: Text to synthesize (sent as ``tts_text``).
            reffile: Path of the reference audio uploaded as ``prompt_wav``.
            reftext: Transcript of the reference audio (sent as ``prompt_text``).
            language: Unused; kept so the signature stays compatible.
            server_url: Base URL of the server; ``/inference_zero_shot`` is appended.

        Yields:
            Non-empty byte chunks of the response body while the TTS state is
            ``State.RUNNING``. Nothing is yielded when the server does not
            answer with HTTP 200.
        """
        start = time.perf_counter()
        payload = {
            'tts_text': text,
            'prompt_text': reftext
        }
        # The multipart body is built while the request is being sent, so the
        # prompt file only has to stay open for the duration of this call.
        with open(reffile, 'rb') as prompt_wav:
            files = [('prompt_wav', ('prompt_wav', prompt_wav, 'application/octet-stream'))]
            res = requests.request("GET", f"{server_url}/inference_zero_shot", data=payload, files=files, stream=True)
        try:
            end = time.perf_counter()
            # NOTE(review): the request above is a GET although the message says POST.
            print(f"cosy_voice Time to make POST: {end-start}s")

            if res.status_code != 200:
                print("Error:", res.text)
                return

            first = True
            # 16000 bytes per read, i.e. up to 8000 int16 samples (see stream_tts).
            for chunk in res.iter_content(chunk_size=16000):
                if first:
                    end = time.perf_counter()
                    print(f"cosy_voice Time to first chunk: {end-start}s")
                    first = False
                # Chunks are skipped (not yielded) once the state leaves RUNNING.
                if chunk and self.state==State.RUNNING:
                    yield chunk

            print("cosy_voice response.elapsed:", res.elapsed)
        finally:
            # Release the streamed connection, also when the consumer stops early.
            res.close()

    def stream_tts(self, audio_stream):
        """Decode, resample and hand audio to the parent in ``self.chunk`` frames.

        Each item of ``audio_stream`` is treated as raw 16-bit PCM at 22050 Hz,
        scaled to float32 and resampled to ``self.sample_rate``. Samples that do
        not fill a whole frame are carried over to the next network chunk
        instead of being discarded; only a final partial frame at the very end
        of the stream is dropped.
        """
        pending_bytes = b""  # dangling odd byte from the previous network chunk
        leftover = np.zeros(0, dtype=np.float32)  # resampled samples not yet sent

        for chunk in audio_stream:
            if chunk is None or len(chunk) == 0:
                continue

            # int16 decoding needs an even byte count; keep a trailing odd byte
            # and prepend it to the next chunk so samples stay aligned.
            data = pending_bytes + chunk if pending_bytes else chunk
            usable = len(data) - (len(data) % 2)
            pending_bytes = data[usable:]
            if usable == 0:
                continue

            stream = np.frombuffer(data[:usable], dtype=np.int16).astype(np.float32) / 32767
            stream = resampy.resample(x=stream, sr_orig=22050, sr_new=self.sample_rate)
            if leftover.size:
                stream = np.concatenate((leftover, stream))

            streamlen = stream.shape[0]
            idx = 0
            while streamlen >= self.chunk:
                self.parent.put_audio_frame(stream[idx:idx + self.chunk])
                streamlen -= self.chunk
                idx += self.chunk
            leftover = stream[idx:]
########################################################################################### ###########################################################################################
class XTTS(BaseTTS): class XTTS(BaseTTS):
def __init__(self, opt, parent): def __init__(self, opt, parent):