add tts cosyvoice
parent 275af1ed9e
commit f584cb25d1
basereal.py
@@ -18,7 +18,7 @@ import soundfile as sf
 import av
 from fractions import Fraction
 
-from ttsreal import EdgeTTS,VoitsTTS,XTTS
+from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS
 
 from tqdm import tqdm
 def read_imgs(img_list):
@@ -41,6 +41,8 @@ class BaseReal:
             self.tts = VoitsTTS(opt,self)
         elif opt.tts == "xtts":
             self.tts = XTTS(opt,self)
+        elif opt.tts == "cosyvoice":
+            self.tts = CosyVoiceTTS(opt,self)
 
         self.recording = False
         self.recordq_video = Queue()
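For context, a minimal sketch (not part of the commit) of the option fields the new branch relies on. The SimpleNamespace stand-in for opt and the sample values are assumptions; only the attribute names (tts, REF_FILE, REF_TEXT, TTS_SERVER) come from this diff and the ttsreal.py hunk below.

from types import SimpleNamespace

# Hypothetical opt object; BaseReal.__init__ would then take the new branch and
# construct CosyVoiceTTS(opt, self) as shown in the hunk above.
opt = SimpleNamespace(
    tts="cosyvoice",                     # selects the new CosyVoiceTTS branch
    REF_FILE="ref.wav",                  # uploaded as prompt_wav to the CosyVoice server (placeholder path)
    REF_TEXT="reference transcript",     # sent as prompt_text alongside the prompt audio (placeholder)
    TTS_SERVER="http://127.0.0.1:5000",  # default hinted by the commented URL in CosyVoiceTTS.txt_to_audio() below
)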
nerfreal.py
@@ -17,6 +17,8 @@ import asyncio
 from av import AudioFrame, VideoFrame
 from basereal import BaseReal
 
+#from imgcache import ImgCache
+
 from tqdm import tqdm
 def read_imgs(img_list):
     frames = []
@@ -60,6 +62,7 @@ class NeRFReal(BaseReal):
             input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
             #print('input_img_list:',input_img_list)
             self.fullbody_list_cycle = read_imgs(input_img_list[:frame_total_num])
+            #self.imagecache = ImgCache(frame_total_num,self.opt.fullbody_img,1000)
 
        #self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32)
        #self.need_update = True # camera moved, should reset accumulation
@@ -225,7 +228,8 @@ class NeRFReal(BaseReal):
                #print("frame index:",data['index'])
                #image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg'))
                image_fullbody = self.fullbody_list_cycle[data['index'][0]]
+               #image_fullbody = self.imagecache.get_img(data['index'][0])
                image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB)
                start_x = self.opt.fullbody_offset_x # starting x coordinate of the small image in the merged frame
                start_y = self.opt.fullbody_offset_y # starting y coordinate of the small image in the merged frame
                image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image
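As an aside, a tiny illustration (not part of the commit) of the overlay done in the hunk above: the rendered head image is pasted into the full-body frame at the configured offsets. Shapes and offset values here are placeholders.

import numpy as np

image_fullbody = np.zeros((1080, 720, 3), dtype=np.uint8)  # full-body background frame (placeholder size)
image = np.full((256, 256, 3), 255, dtype=np.uint8)        # rendered head region (placeholder size)
start_x, start_y = 232, 100                                 # stand-ins for fullbody_offset_x / fullbody_offset_y

# Same slicing as in NeRFReal: write the small image into the larger frame in place.
image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image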
ttsreal.py (56 lines changed)
@@ -156,7 +156,7 @@ class VoitsTTS(BaseTTS):
            return
 
        first = True
-       for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2
+       for chunk in res.iter_content(chunk_size=16000): # 1280 32K*20ms*2
            if first:
                end = time.perf_counter()
                print(f"gpt_sovits Time to first chunk: {end-start}s")
@@ -180,6 +180,60 @@ class VoitsTTS(BaseTTS):
                    streamlen -= self.chunk
                    idx += self.chunk
 
+###########################################################################################
+
+class CosyVoiceTTS(BaseTTS):
+    def txt_to_audio(self,msg):
+        self.stream_tts(
+            self.cosy_voice(
+                msg,
+                self.opt.REF_FILE,
+                self.opt.REF_TEXT,
+                "zh", #en args.language,
+                self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
+            )
+        )
+
+    def cosy_voice(self, text, reffile, reftext, language, server_url) -> Iterator[bytes]:
+        start = time.perf_counter()
+        payload = {
+            'tts_text': text,
+            'prompt_text': reftext
+        }
+        files = [('prompt_wav', ('prompt_wav', open(reffile, 'rb'), 'application/octet-stream'))]
+        res = requests.request("GET", f"{server_url}/inference_zero_shot", data=payload, files=files, stream=True)
+
+        end = time.perf_counter()
+        print(f"cosy_voice Time to make POST: {end-start}s")
+
+        if res.status_code != 200:
+            print("Error:", res.text)
+            return
+
+        first = True
+        for chunk in res.iter_content(chunk_size=16000): # 1280 32K*20ms*2
+            if first:
+                end = time.perf_counter()
+                print(f"cosy_voice Time to first chunk: {end-start}s")
+                first = False
+            if chunk and self.state==State.RUNNING:
+                yield chunk
+        print("cosy_voice response.elapsed:", res.elapsed)
+
+    def stream_tts(self, audio_stream):
+        for chunk in audio_stream:
+            if chunk is not None and len(chunk) > 0:
+                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                stream = resampy.resample(x=stream, sr_orig=22050, sr_new=self.sample_rate)
+                #byte_stream=BytesIO(buffer)
+                #stream = self.__create_bytes_stream(byte_stream)
+                streamlen = stream.shape[0]
+                idx = 0
+                while streamlen >= self.chunk:
+                    self.parent.put_audio_frame(stream[idx:idx+self.chunk])
+                    streamlen -= self.chunk
+                    idx += self.chunk
+
 ###########################################################################################
 class XTTS(BaseTTS):
     def __init__(self, opt, parent):
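For reference, a hedged standalone sketch of the request the new cosy_voice() generator issues, assuming the CosyVoice server exposes /inference_zero_shot and streams raw 16-bit PCM at 22050 Hz (the rate stream_tts() resamples from). The server URL, file path, and texts are placeholders.

import numpy as np
import requests

server_url = "http://127.0.0.1:5000"   # placeholder; matches the commented default above
payload = {"tts_text": "你好", "prompt_text": "reference transcript"}
files = [("prompt_wav", ("prompt_wav", open("ref.wav", "rb"), "application/octet-stream"))]

# Same call shape as cosy_voice(): multipart form data sent with a streaming GET.
res = requests.request("GET", f"{server_url}/inference_zero_shot",
                       data=payload, files=files, stream=True)
res.raise_for_status()

pcm = bytearray()
for chunk in res.iter_content(chunk_size=16000):  # same chunk size used in the diff
    if chunk:
        pcm.extend(chunk)

# Decode exactly as stream_tts() does before resampling: int16 -> float32 in [-1, 1].
audio = np.frombuffer(bytes(pcm), dtype=np.int16).astype(np.float32) / 32767
print(f"received {audio.shape[0]} samples (assumed 22050 Hz)")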