Refactoring tts code
parent 4e355e9ab9 / commit 632409da1e
README.md (11 changes)
@@ -1,5 +1,5 @@
-A streaming digital human based on the ernerf model, realizing synchronized audio and video dialogue. It can essentially achieve commercial-grade results.
-A streaming digital human based on the ernerf model, with synchronized audio-video dialogue; essentially ready for commercial use.
+Real-time interactive streaming digital human, realizing synchronized audio and video dialogue. It can essentially achieve commercial-grade results.
+Real-time interactive streaming digital human with synchronized audio-video dialogue; essentially ready for commercial use.
 
 [ernerf demo](https://www.bilibili.com/video/BV1PM4m1y7Q2/) [musetalk demo](https://www.bilibili.com/video/BV1gm421N7vQ/)

@@ -23,17 +23,17 @@ conda create -n nerfstream python=3.10
 conda activate nerfstream
 conda install pytorch==1.12.1 torchvision==0.13.1 cudatoolkit=11.3 -c pytorch
 pip install -r requirements.txt
 # If you only use the musetalk model, the libraries below are not needed
 pip install "git+https://github.com/facebookresearch/pytorch3d.git"
 pip install tensorflow-gpu==2.8.0
 pip install --upgrade "protobuf<=3.20.1"
 pip install --upgrade "edge-tts<=6.1.11"
 ```
 Common installation issues: [FAQ](/assets/faq.md)
 For setting up a CUDA environment on Linux, see https://zhuanlan.zhihu.com/p/674972886
 
 ## 2. Quick Start
-By default, webrtc is used to push the stream to srs.
+By default, the ernerf model is used, with webrtc pushing the stream to srs.
 ### 2.1 Run the rtmp server (srs)
 ```
 export CANDIDATE='<public IP of the server>'

@@ -212,7 +212,8 @@ The docker version is no longer the latest code; it can serve as an empty environment, with the latest
 - [ ] SyncTalk
 
 If this project helps you, please give it a star. Friends who are interested are also welcome to help improve the project together.
 Email: lipku@foxmail.com
 Knowledge Planet (知识星球): https://t.zsxq.com/7NMyO
 WeChat official account: 数字人技术
 
 Buy me a coffee
 
app.py (169 changes)
@@ -23,132 +23,11 @@ import argparse
 
 import shutil
 import asyncio
-import edge_tts
-from typing import Iterator
-
-import requests
 
 app = Flask(__name__)
 sockets = Sockets(app)
 global nerfreal
-global tts_type
-global gspeaker
-
-
-async def main(voicename: str, text: str, render):
-    communicate = edge_tts.Communicate(text, voicename)
-
-    #with open(OUTPUT_FILE, "wb") as file:
-    first = True
-    async for chunk in communicate.stream():
-        if first:
-            #render.before_push_audio()
-            first = False
-        if chunk["type"] == "audio":
-            render.push_audio(chunk["data"])
-            #file.write(chunk["data"])
-        elif chunk["type"] == "WordBoundary":
-            pass
-
-def get_speaker(ref_audio,server_url):
-    files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))}
-    response = requests.post(f"{server_url}/clone_speaker", files=files)
-    return response.json()
-
-def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
-    start = time.perf_counter()
-    speaker["text"] = text
-    speaker["language"] = language
-    speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
-    res = requests.post(
-        f"{server_url}/tts_stream",
-        json=speaker,
-        stream=True,
-    )
-    end = time.perf_counter()
-    print(f"xtts Time to make POST: {end-start}s")
-
-    if res.status_code != 200:
-        print("Error:", res.text)
-        return
-
-    first = True
-    for chunk in res.iter_content(chunk_size=960): #24K*20ms*2
-        if first:
-            end = time.perf_counter()
-            print(f"xtts Time to first chunk: {end-start}s")
-            first = False
-        if chunk:
-            yield chunk
-
-    print("xtts response.elapsed:", res.elapsed)
-
-def gpt_sovits(text, character, language, server_url, emotion) -> Iterator[bytes]:
-    start = time.perf_counter()
-    req={}
-    req["text"] = text
-    req["text_language"] = language
-    req["character"] = character
-    req["emotion"] = emotion
-    #req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
-    req["stream"] = True
-    res = requests.post(
-        f"{server_url}/tts",
-        json=req,
-        stream=True,
-    )
-    end = time.perf_counter()
-    print(f"gpt_sovits Time to make POST: {end-start}s")
-
-    if res.status_code != 200:
-        print("Error:", res.text)
-        return
-
-    first = True
-    for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2
-        if first:
-            end = time.perf_counter()
-            print(f"gpt_sovits Time to first chunk: {end-start}s")
-            first = False
-        if chunk:
-            yield chunk
-
-    print("gpt_sovits response.elapsed:", res.elapsed)
-
-def stream_tts(audio_stream,render):
-    for chunk in audio_stream:
-        if chunk is not None:
-            render.push_audio(chunk)
-
-def txt_to_audio(text_):
-    if tts_type == "edgetts":
-        voicename = "zh-CN-YunxiaNeural"
-        text = text_
-        t = time.time()
-        asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
-        print(f'-------edge tts time:{time.time()-t:.4f}s')
-    elif tts_type == "gpt-sovits": #gpt_sovits
-        stream_tts(
-            gpt_sovits(
-                text_,
-                app.config['CHARACTER'], #"test", #character
-                "zh", #en args.language,
-                app.config['TTS_SERVER'], #"http://127.0.0.1:5000", #args.server_url,
-                app.config['EMOTION'], #emotion
-            ),
-            nerfreal
-        )
-    else: #xtts
-        stream_tts(
-            xtts(
-                text_,
-                gspeaker,
-                "zh-cn", #en args.language,
-                app.config['TTS_SERVER'], #"http://localhost:9000", #args.server_url,
-                "20" #args.stream_chunk_size
-            ),
-            nerfreal
-        )
 
 
 @sockets.route('/humanecho')

@@ -168,7 +47,7 @@ def echo_socket(ws):
     if not message or len(message)==0:
         return '输入信息为空'
     else:
-        txt_to_audio(message)
+        nerfreal.put_msg_txt(message)


 def llm_response(message):

@@ -198,42 +77,11 @@ def chat_socket(ws):
         return '输入信息为空'
     else:
         res=llm_response(message)
-        txt_to_audio(res)
+        nerfreal.put_msg_txt(res)

 #####webrtc###############################
 pcs = set()

-async def txt_to_audio_async(text_):
-    if tts_type == "edgetts":
-        voicename = "zh-CN-YunxiaNeural"
-        text = text_
-        t = time.time()
-        #asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
-        await main(voicename,text,nerfreal)
-        print(f'-------edge tts time:{time.time()-t:.4f}s')
-    elif tts_type == "gpt-sovits": #gpt_sovits
-        stream_tts(
-            gpt_sovits(
-                text_,
-                app.config['CHARACTER'], #"test", #character
-                "zh", #en args.language,
-                app.config['TTS_SERVER'], #"http://127.0.0.1:5000", #args.server_url,
-                app.config['EMOTION'], #emotion
-            ),
-            nerfreal
-        )
-    else: #xtts
-        stream_tts(
-            xtts(
-                text_,
-                gspeaker,
-                "zh-cn", #en args.language,
-                app.config['TTS_SERVER'], #"http://localhost:9000", #args.server_url,
-                "20" #args.stream_chunk_size
-            ),
-            nerfreal
-        )

 #@app.route('/offer', methods=['POST'])
 async def offer(request):
     params = await request.json()

@@ -271,10 +119,10 @@ async def human(request):
     params = await request.json()

     if params['type']=='echo':
-        await txt_to_audio_async(params['text'])
+        nerfreal.put_msg_txt(params['text'])
     elif params['type']=='chat':
-        res=llm_response(params['text'])
-        await txt_to_audio_async(res)
+        res = await asyncio.get_event_loop().run_in_executor(None, llm_response, params['text'])
+        nerfreal.put_msg_txt(res)

     return web.Response(
         content_type="application/json",

@@ -453,14 +301,9 @@ if __name__ == '__main__':
     parser.add_argument('--listenport', type=int, default=8010)

     opt = parser.parse_args()
-    app.config.from_object(opt)
+    #app.config.from_object(opt)
+    #print(app.config)

-    tts_type = opt.tts
-    if tts_type == "xtts":
-        print("Computing the latents for a new reference...")
-        gspeaker = get_speaker(opt.REF_FILE, opt.TTS_SERVER)

     if opt.model == 'ernerf':
         from ernerf.nerf_triplane.provider import NeRFDataset_Test
         from ernerf.nerf_triplane.utils import *
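With the TTS functions moved out of app.py, the handlers above now only enqueue text via `nerfreal.put_msg_txt`; synthesis runs on a background thread inside the new ttsreal module, so requests return immediately. A minimal sketch of that producer/worker decoupling (the `TTSWorker` name and the `print` stand-in for synthesis are illustrative, not part of the commit):

```python
import queue
from threading import Thread, Event

class TTSWorker:
    def __init__(self):
        self.msgqueue = queue.Queue()

    def put_msg_txt(self, msg):       # what the handlers now call
        self.msgqueue.put(msg)

    def render(self, quit_event):     # started once, like BaseTTS.render
        Thread(target=self._loop, args=(quit_event,), daemon=True).start()

    def _loop(self, quit_event):
        while not quit_event.is_set():
            try:
                msg = self.msgqueue.get(block=True, timeout=1)
            except queue.Empty:
                continue
            print(f"synthesizing: {msg}")  # stands in for txt_to_audio(msg)

quit_event = Event()
worker = TTSWorker()
worker.render(quit_event)
worker.put_msg_txt("hello")  # returns immediately; synthesis happens off-thread
```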
asrreal.py (54 changes)
@@ -56,7 +56,6 @@ class ASR:
 
         # create input stream
         self.queue = Queue()
-        self.input_stream = BytesIO()
         self.output_queue = Queue()
         # start a background process to read frames
         #self.input_stream = self.audio_instance.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, output=False, frames_per_buffer=self.chunk)

@@ -204,6 +203,9 @@ class ASR:
         # np.save(output_path, unfold_feats.cpu().numpy())
         # print(f"[INFO] saved logits to {output_path}")

+    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
+        self.queue.put(audio_chunk)
+
     def __get_audio_frame(self):
         if self.inwarm: # warm up
             return np.zeros(self.chunk, dtype=np.float32),1

@@ -260,56 +262,6 @@ class ASR:
 
         return logits[0], None,None #predicted_ids[0], transcription # [N,]

-    def __create_bytes_stream(self,byte_stream):
-        #byte_stream=BytesIO(buffer)
-        stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
-        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
-        stream = stream.astype(np.float32)
-
-        if stream.ndim > 1:
-            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
-            stream = stream[:, 0]
-
-        if sample_rate != self.sample_rate and stream.shape[0]>0:
-            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
-            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
-
-        return stream
-
-    def push_audio(self,buffer): #push audio pcm from tts
-        print(f'[INFO] push_audio {len(buffer)}')
-        if self.opt.tts == "xtts" or self.opt.tts == "gpt-sovits":
-            if len(buffer)>0:
-                stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767
-                if self.opt.tts == "xtts":
-                    stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
-                else:
-                    stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
-                #byte_stream=BytesIO(buffer)
-                #stream = self.__create_bytes_stream(byte_stream)
-                streamlen = stream.shape[0]
-                idx=0
-                while streamlen >= self.chunk:
-                    self.queue.put(stream[idx:idx+self.chunk])
-                    streamlen -= self.chunk
-                    idx += self.chunk
-                # if streamlen>0: #skip last frame(not 20ms)
-                #     self.queue.put(stream[idx:])
-        else: #edge tts
-            self.input_stream.write(buffer)
-            if len(buffer)<=0:
-                self.input_stream.seek(0)
-                stream = self.__create_bytes_stream(self.input_stream)
-                streamlen = stream.shape[0]
-                idx=0
-                while streamlen >= self.chunk:
-                    self.queue.put(stream[idx:idx+self.chunk])
-                    streamlen -= self.chunk
-                    idx += self.chunk
-                #if streamlen>0: #skip last frame(not 20ms)
-                #    self.queue.put(stream[idx:])
-                self.input_stream.seek(0)
-                self.input_stream.truncate()
-
     def get_audio_out(self): #get origin audio pcm to nerf
         return self.output_queue.get()
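The new `put_audio_frame` contract is plain 16 kHz, 20 ms PCM frames: with `fps = 50`, `chunk = 16000 // 50 = 320` samples per frame. A small runnable sketch of the framing loop that both the removed `push_audio` and the new ttsreal senders share (buffer length here is an arbitrary example):

```python
import numpy as np

sample_rate = 16000
fps = 50                      # one frame every 20 ms
chunk = sample_rate // fps    # 320 samples per frame

pcm = np.zeros(5 * chunk + 17, dtype=np.float32)  # arbitrary-length audio
idx, streamlen, frames = 0, pcm.shape[0], []
while streamlen >= chunk:     # same loop shape as in the diff
    frames.append(pcm[idx:idx + chunk])
    idx += chunk
    streamlen -= chunk        # the <20 ms tail is dropped, as the comments note
print(len(frames), frames[0].shape)  # 5 (320,)
```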
museasr.py (58 changes)
@@ -18,7 +18,7 @@ class MuseASR:
         self.sample_rate = 16000
         self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000)
         self.queue = Queue()
-        self.input_stream = BytesIO()
+        # self.input_stream = BytesIO()
         self.output_queue = Queue()

         self.audio_processor = audio_processor

@@ -29,62 +29,14 @@ class MuseASR:
 
         self.warm_up()

-    def __create_bytes_stream(self,byte_stream):
-        #byte_stream=BytesIO(buffer)
-        stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
-        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
-        stream = stream.astype(np.float32)
-
-        if stream.ndim > 1:
-            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
-            stream = stream[:, 0]
-
-        if sample_rate != self.sample_rate and stream.shape[0]>0:
-            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
-            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
-
-        return stream
-
-    def push_audio(self,buffer):
-        print(f'[INFO] push_audio {len(buffer)}')
-        if self.opt.tts == "xtts" or self.opt.tts == "gpt-sovits":
-            if len(buffer)>0:
-                stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767
-                if self.opt.tts == "xtts":
-                    stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
-                else:
-                    stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
-                #byte_stream=BytesIO(buffer)
-                #stream = self.__create_bytes_stream(byte_stream)
-                streamlen = stream.shape[0]
-                idx=0
-                while streamlen >= self.chunk:
-                    self.queue.put(stream[idx:idx+self.chunk])
-                    streamlen -= self.chunk
-                    idx += self.chunk
-                # if streamlen>0: #skip last frame(not 20ms)
-                #     self.queue.put(stream[idx:])
-        else: #edge tts
-            self.input_stream.write(buffer)
-            if len(buffer)<=0:
-                self.input_stream.seek(0)
-                stream = self.__create_bytes_stream(self.input_stream)
-                streamlen = stream.shape[0]
-                idx=0
-                while streamlen >= self.chunk:
-                    self.queue.put(stream[idx:idx+self.chunk])
-                    streamlen -= self.chunk
-                    idx += self.chunk
-                #if streamlen>0: #skip last frame(not 20ms)
-                #    self.queue.put(stream[idx:])
-                self.input_stream.seek(0)
-                self.input_stream.truncate()
+    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
+        self.queue.put(audio_chunk)

     def __get_audio_frame(self):
         try:
-            frame = self.queue.get(block=False)
+            frame = self.queue.get(block=True,timeout=0.02)
             type = 0
-            print(f'[INFO] get frame {frame.shape}')
+            #print(f'[INFO] get frame {frame.shape}')
         except queue.Empty:
             frame = np.zeros(self.chunk, dtype=np.float32)
             type = 1
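The `__get_audio_frame` change swaps a non-blocking `get` for a 20 ms bounded wait, so a TTS frame that arrives slightly late is still consumed instead of being replaced by silence. A runnable sketch of the pattern (queue contents and chunk size are illustrative):

```python
import queue
import numpy as np

q = queue.Queue()
chunk = 320

def get_audio_frame():
    try:
        frame = q.get(block=True, timeout=0.02)  # new: wait up to one frame period
        ftype = 0                                # 0 = real TTS audio
    except queue.Empty:
        frame = np.zeros(chunk, dtype=np.float32)
        ftype = 1                                # 1 = silence placeholder
    return frame, ftype

print(get_audio_frame()[1])  # 1: queue empty, silence frame after ~20 ms
```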
musereal.py (51 changes)
@@ -21,6 +21,7 @@ from musetalk.utils.utils import get_file_type,get_video_fps,datagen
 from musetalk.utils.preprocessing import get_landmark_and_bbox,read_imgs,coord_placeholder
 from musetalk.utils.blending import get_image,get_image_prepare_material,get_image_blending
 from musetalk.utils.utils import load_all_model
+from ttsreal import EdgeTTS,VoitsTTS,XTTS

 from museasr import MuseASR
 import asyncio

@@ -59,6 +60,13 @@ class MuseReal:
         self.__loadavatar()

         self.asr = MuseASR(opt,self.audio_processor)
+        if opt.tts == "edgetts":
+            self.tts = EdgeTTS(opt,self)
+        elif opt.tts == "gpt-sovits":
+            self.tts = VoitsTTS(opt,self)
+        elif opt.tts == "xtts":
+            self.tts = XTTS(opt,self)
+        #self.__warm_up()

     def __loadmodels(self):
         # load model weights

@@ -83,8 +91,11 @@ class MuseReal:
         self.mask_list_cycle = read_imgs(input_mask_list)


-    def push_audio(self,buffer):
-        self.asr.push_audio(buffer)
+    def put_msg_txt(self,msg):
+        self.tts.put_msg_txt(msg)
+
+    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
+        self.asr.put_audio_frame(audio_chunk)

     def __mirror_index(self, index):
         size = len(self.coord_list_cycle)

@@ -95,11 +106,35 @@ class MuseReal:
         else:
             return size - res - 1

+    def __warm_up(self):
+        self.asr.run_step()
+        whisper_chunks = self.asr.get_next_feat()
+        whisper_batch = np.stack(whisper_chunks)
+        latent_batch = []
+        for i in range(self.batch_size):
+            idx = self.__mirror_index(self.idx+i)
+            latent = self.input_latent_list_cycle[idx]
+            latent_batch.append(latent)
+        latent_batch = torch.cat(latent_batch, dim=0)
+        print('infer=======')
+        # for i, (whisper_batch,latent_batch) in enumerate(gen):
+        audio_feature_batch = torch.from_numpy(whisper_batch)
+        audio_feature_batch = audio_feature_batch.to(device=self.unet.device,
+                                                     dtype=self.unet.model.dtype)
+        audio_feature_batch = self.pe(audio_feature_batch)
+        latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
+
+        pred_latents = self.unet.model(latent_batch,
+                                       self.timesteps,
+                                       encoder_hidden_states=audio_feature_batch).sample
+        recon = self.vae.decode_latents(pred_latents)
+
     def test_step(self,loop=None,audio_track=None,video_track=None):

         # gen = datagen(whisper_chunks,
         #               self.input_latent_list_cycle,
         #               self.batch_size)
+        starttime=time.perf_counter()
         self.asr.run_step()
         whisper_chunks = self.asr.get_next_feat()
         is_all_silence=True

@@ -114,7 +149,8 @@ class MuseReal:
             self.res_frame_queue.put((None,self.__mirror_index(self.idx),audio_frames[i*2:i*2+2]))
             self.idx = self.idx + 1
         else:
-            print('infer=======')
+            # print('infer=======')
+            t=time.perf_counter()
             whisper_batch = np.stack(whisper_chunks)
             latent_batch = []
             for i in range(self.batch_size):

@@ -129,16 +165,22 @@ class MuseReal:
                                                          dtype=self.unet.model.dtype)
             audio_feature_batch = self.pe(audio_feature_batch)
             latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
+            # print('prepare time:',time.perf_counter()-t)
+            # t=time.perf_counter()

             pred_latents = self.unet.model(latent_batch,
                                            self.timesteps,
                                            encoder_hidden_states=audio_feature_batch).sample
+            # print('unet time:',time.perf_counter()-t)
+            # t=time.perf_counter()
             recon = self.vae.decode_latents(pred_latents)
+            # print('vae time:',time.perf_counter()-t)
             #print('diffusion len=',len(recon))
             for i,res_frame in enumerate(recon):
                 #self.__pushmedia(res_frame,loop,audio_track,video_track)
                 self.res_frame_queue.put((res_frame,self.__mirror_index(self.idx),audio_frames[i*2:i*2+2]))
                 self.idx = self.idx + 1
+            print('total batch time:',time.perf_counter()-starttime)


     def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):

@@ -176,11 +218,13 @@ class MuseReal:
             # if audio_track._queue.qsize()>10:
             #     time.sleep(0.1)
             asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
+        print('musereal process_frames thread stop')

     def render(self,quit_event,loop=None,audio_track=None,video_track=None):
         #if self.opt.asr:
         #     self.asr.warm_up()
+        self.tts.render(quit_event)
         process_thread = Thread(target=self.process_frames, args=(quit_event,loop,audio_track,video_track))
         process_thread.start()

@@ -207,4 +251,5 @@ class MuseReal:
             # delay = _starttime+_totalframe*0.04-time.perf_counter() #40ms
             # if delay > 0:
             #     time.sleep(delay)
+        print('musereal thread stop')
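The `__mirror_index` helper visible in the hunks above cycles through the cached avatar frames forward, then backward, so playback never jumps at the loop boundary. A standalone sketch consistent with the `res` / `size - res - 1` branches shown:

```python
def mirror_index(size, index):
    # walk 0..size-1 on even passes, size-1..0 on odd passes
    turn = index // size
    res = index % size
    return res if turn % 2 == 0 else size - res - 1

print([mirror_index(4, i) for i in range(10)])
# [0, 1, 2, 3, 3, 2, 1, 0, 0, 1]
```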
nerfreal.py (18 changes)
@@ -10,6 +10,8 @@ import torch.nn.functional as F
 import cv2

 from asrreal import ASR
+from ttsreal import EdgeTTS,VoitsTTS,XTTS
+
 import asyncio
 from av import AudioFrame, VideoFrame


@@ -63,6 +65,12 @@ class NeRFReal:
         if self.opt.asr:
             self.asr = ASR(opt)
             self.asr.warm_up()
+        if opt.tts == "edgetts":
+            self.tts = EdgeTTS(opt,self)
+        elif opt.tts == "gpt-sovits":
+            self.tts = VoitsTTS(opt,self)
+        elif opt.tts == "xtts":
+            self.tts = XTTS(opt,self)

         '''
         video_path = 'video_stream'

@@ -110,8 +118,11 @@ class NeRFReal:
         if self.opt.asr:
             self.asr.stop()

-    def push_audio(self,chunk):
-        self.asr.push_audio(chunk)
+    def put_msg_txt(self,msg):
+        self.tts.put_msg_txt(msg)
+
+    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
+        self.asr.put_audio_frame(audio_chunk)


     def mirror_index(self, index):

@@ -231,6 +242,8 @@ class NeRFReal:
         totaltime=0
         _starttime=time.perf_counter()
         _totalframe=0
+
+        self.tts.render(quit_event)
         while not quit_event.is_set(): #todo
             # update texture every frame
             # audio stream thread...

@@ -255,5 +268,6 @@ class NeRFReal:
             if video_track._queue.qsize()>=5:
                 #print('sleep qsize=',video_track._queue.qsize())
                 time.sleep(0.1)
+        print('nerfreal thread stop')
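The same three-way TTS selection now appears verbatim in both NeRFReal and MuseReal. A hypothetical factory (not part of the commit) would express the mapping once:

```python
from ttsreal import EdgeTTS, VoitsTTS, XTTS

def make_tts(opt, parent):
    # illustrative helper; the commit inlines this if/elif chain in each class
    tts_cls = {"edgetts": EdgeTTS, "gpt-sovits": VoitsTTS, "xtts": XTTS}.get(opt.tts)
    if tts_cls is None:
        raise ValueError(f"unknown tts type: {opt.tts}")
    return tts_cls(opt, parent)

# usage inside __init__: self.tts = make_tts(opt, self)
```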
ttsreal.py (new file, 223 lines)
@@ -0,0 +1,223 @@
+import time
+import numpy as np
+import soundfile as sf
+import resampy
+import asyncio
+import edge_tts
+
+from typing import Iterator
+
+import requests
+
+import queue
+from queue import Queue
+from io import BytesIO
+from threading import Thread, Event
+
+class BaseTTS:
+    def __init__(self, opt, parent):
+        self.opt=opt
+        self.parent = parent
+
+        self.fps = opt.fps # 20 ms per frame
+        self.sample_rate = 16000
+        self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000)
+        self.input_stream = BytesIO()
+
+        self.msgqueue = Queue()
+
+    def put_msg_txt(self,msg):
+        self.msgqueue.put(msg)
+
+    def render(self,quit_event):
+        process_thread = Thread(target=self.process_tts, args=(quit_event,))
+        process_thread.start()
+
+    def process_tts(self,quit_event):
+        while not quit_event.is_set():
+            try:
+                msg = self.msgqueue.get(block=True, timeout=1)
+            except queue.Empty:
+                continue
+            self.txt_to_audio(msg)
+        print('ttsreal thread stop')
+
+    def txt_to_audio(self,msg):
+        pass
+
+###########################################################################################
+class EdgeTTS(BaseTTS):
+    def txt_to_audio(self,msg):
+        voicename = "zh-CN-YunxiaNeural"
+        text = msg
+        t = time.time()
+        asyncio.new_event_loop().run_until_complete(self.__main(voicename,text))
+        print(f'-------edge tts time:{time.time()-t:.4f}s')
+
+        self.input_stream.seek(0)
+        stream = self.__create_bytes_stream(self.input_stream)
+        streamlen = stream.shape[0]
+        idx=0
+        while streamlen >= self.chunk:
+            self.parent.put_audio_frame(stream[idx:idx+self.chunk])
+            streamlen -= self.chunk
+            idx += self.chunk
+        #if streamlen>0: #skip last frame(not 20ms)
+        #    self.queue.put(stream[idx:])
+        self.input_stream.seek(0)
+        self.input_stream.truncate()
+
+    def __create_bytes_stream(self,byte_stream):
+        #byte_stream=BytesIO(buffer)
+        stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
+        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
+        stream = stream.astype(np.float32)
+
+        if stream.ndim > 1:
+            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
+            stream = stream[:, 0]
+
+        if sample_rate != self.sample_rate and stream.shape[0]>0:
+            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
+            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
+
+        return stream
+
+    async def __main(self,voicename: str, text: str):
+        communicate = edge_tts.Communicate(text, voicename)
+
+        #with open(OUTPUT_FILE, "wb") as file:
+        first = True
+        async for chunk in communicate.stream():
+            if first:
+                first = False
+            if chunk["type"] == "audio":
+                #self.push_audio(chunk["data"])
+                self.input_stream.write(chunk["data"])
+                #file.write(chunk["data"])
+            elif chunk["type"] == "WordBoundary":
+                pass
+
+###########################################################################################
+class VoitsTTS(BaseTTS):
+    def txt_to_audio(self,msg):
+        self.stream_tts(
+            self.gpt_sovits(
+                msg,
+                self.opt.CHARACTER, #"test", #character
+                "zh", #en args.language,
+                self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
+                self.opt.EMOTION, #emotion
+            )
+        )
+
+    def gpt_sovits(self, text, character, language, server_url, emotion) -> Iterator[bytes]:
+        start = time.perf_counter()
+        req={}
+        req["text"] = text
+        req["text_language"] = language
+        req["character"] = character
+        req["emotion"] = emotion
+        #req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
+        req["stream"] = True
+        res = requests.post(
+            f"{server_url}/tts",
+            json=req,
+            stream=True,
+        )
+        end = time.perf_counter()
+        print(f"gpt_sovits Time to make POST: {end-start}s")
+
+        if res.status_code != 200:
+            print("Error:", res.text)
+            return
+
+        first = True
+        for chunk in res.iter_content(chunk_size=32000): # 1280 32K*20ms*2
+            if first:
+                end = time.perf_counter()
+                print(f"gpt_sovits Time to first chunk: {end-start}s")
+                first = False
+            if chunk:
+                yield chunk
+
+        print("gpt_sovits response.elapsed:", res.elapsed)
+
+    def stream_tts(self,audio_stream):
+        for chunk in audio_stream:
+            if chunk is not None and len(chunk)>0:
+                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
+                #byte_stream=BytesIO(buffer)
+                #stream = self.__create_bytes_stream(byte_stream)
+                streamlen = stream.shape[0]
+                idx=0
+                while streamlen >= self.chunk:
+                    self.parent.put_audio_frame(stream[idx:idx+self.chunk])
+                    streamlen -= self.chunk
+                    idx += self.chunk
+
+###########################################################################################
+class XTTS(BaseTTS):
+    def __init__(self, opt, parent):
+        super().__init__(opt,parent)
+        self.speaker = self.get_speaker(opt.REF_FILE, opt.TTS_SERVER)
+
+    def txt_to_audio(self,msg):
+        self.stream_tts(
+            self.xtts(
+                msg,
+                self.speaker,
+                "zh-cn", #en args.language,
+                self.opt.TTS_SERVER, #"http://localhost:9000", #args.server_url,
+                "20" #args.stream_chunk_size
+            )
+        )
+
+    def get_speaker(self,ref_audio,server_url):
+        files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))}
+        response = requests.post(f"{server_url}/clone_speaker", files=files)
+        return response.json()
+
+    def xtts(self,text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
+        start = time.perf_counter()
+        speaker["text"] = text
+        speaker["language"] = language
+        speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
+        res = requests.post(
+            f"{server_url}/tts_stream",
+            json=speaker,
+            stream=True,
+        )
+        end = time.perf_counter()
+        print(f"xtts Time to make POST: {end-start}s")
+
+        if res.status_code != 200:
+            print("Error:", res.text)
+            return
+
+        first = True
+        for chunk in res.iter_content(chunk_size=960): #24K*20ms*2
+            if first:
+                end = time.perf_counter()
+                print(f"xtts Time to first chunk: {end-start}s")
+                first = False
+            if chunk:
+                yield chunk
+
+        print("xtts response.elapsed:", res.elapsed)
+
+    def stream_tts(self,audio_stream):
+        for chunk in audio_stream:
+            if chunk is not None and len(chunk)>0:
+                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
+                #byte_stream=BytesIO(buffer)
+                #stream = self.__create_bytes_stream(byte_stream)
+                streamlen = stream.shape[0]
+                idx=0
+                while streamlen >= self.chunk:
+                    self.parent.put_audio_frame(stream[idx:idx+self.chunk])
+                    streamlen -= self.chunk
+                    idx += self.chunk
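The chunk sizes in the comments above follow from 16-bit mono PCM: 20 ms at 24 kHz is 24000 × 0.02 × 2 = 960 bytes (the xtts `iter_content` size), and 20 ms at 32 kHz is 1280 bytes, while the gpt-sovits reader actually pulls 32000-byte chunks, i.e. half a second at a time. A quick check:

```python
def bytes_per_20ms(sample_rate, bytes_per_sample=2):
    # 16-bit mono PCM: samples in 20 ms times bytes per sample
    return int(sample_rate * 0.02 * bytes_per_sample)

print(bytes_per_20ms(24000))  # 960  -> xtts iter_content(chunk_size=960)
print(bytes_per_20ms(32000))  # 1280 -> the "1280 32K*20ms*2" comment
# gpt_sovits reads chunk_size=32000 bytes, i.e. 0.5 s of 32 kHz audio per chunk
```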