diff --git a/README.md b/README.md
index 2b5b559..aa52b27 100644
--- a/README.md
+++ b/README.md
@@ -177,12 +177,12 @@ docker run --gpus all -it --network=host --rm registry.cn-hangzhou.aliyuncs.com
 ```
 The docker image does not ship the latest code; use it as a clean environment and copy the latest code in to run.
-An autodl tutorial is also provided:
+An autodl image is also provided:
+https://www.codewithgpu.com/i/lipku/metahuman-stream/base
 [autodl tutorial](autodl/README.md)
-## 5. Data flow
-![](/assets/dataflow.png)
-## 6. Digital human model files
+
+## 5. Digital human model files
 You can swap in a model you trained yourself (https://github.com/Fictionarry/ER-NeRF)
 ```python
 .
@@ -194,7 +194,7 @@ The docker image does not ship the latest code; use it as a clean environment
 ```
 
-## 7. Performance analysis
+## 6. Performance analysis
 1. Frame rate
 On a Tesla T4 the overall fps is about 18; without audio/video encoding and streaming it is about 20. A 4090 reaches 40+ fps.
 Optimization: run audio/video encoding and streaming in a separate thread.
@@ -204,7 +204,7 @@ The docker image does not ship the latest code; use it as a clean environment
 (2) wav2vec adds 0.4s of latency; 18 audio frames must be buffered before computing
 (3) srs forwarding latency; configure the srs server to reduce buffering. See https://ossrs.net/lts/zh-cn/docs/v5/doc/low-latency for details
-## 8. TODO
+## 7. TODO
 - [x] Add ChatGPT for digital human dialogue
 - [x] Voice cloning
 - [x] Play a video clip while the digital human is silent
@@ -215,5 +215,4 @@ The docker image does not ship the latest code; use it as a clean environment
 知识星球 (Knowledge Planet): https://t.zsxq.com/7NMyO  WeChat official account: 数字人技术
 ![](https://mmbiz.qpic.cn/sz_mmbiz_jpg/l3ZibgueFiaeyfaiaLZGuMGQXnhLWxibpJUS2gfs8Dje6JuMY8zu2tVyU9n8Zx1yaNncvKHBMibX0ocehoITy5qQEZg/640?wxfrom=12&tp=wxpic&usePicPrefetch=1&wx_fmt=jpeg&from=appmsg)
-Buy me a coffee
-![](https://mmbiz.qpic.cn/sz_mmbiz_jpg/l3ZibgueFiaeyEO2TDmroXibUSeFRCB3ftThHyTgVmVYyVVyvqDxronGvoU7xzkztnwQpnM5lBgx4MSaUUrnRZwCw/640?wx_fmt=jpeg&from=appmsg)
+
diff --git a/app.py b/app.py
index a552435..82401ad 100644
--- a/app.py
+++ b/app.py
@@ -163,10 +163,11 @@ async def run(push_url):
     await pc.setLocalDescription(await pc.createOffer())
     answer = await post(push_url,pc.localDescription.sdp)
     await pc.setRemoteDescription(RTCSessionDescription(sdp=answer,type='answer'))
-##########################################
-
+##########################################
+# os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
+# os.environ['MULTIPROCESSING_METHOD'] = 'forkserver'
 if __name__ == '__main__':
-
+    multiprocessing.set_start_method('spawn')
     parser = argparse.ArgumentParser()
     parser.add_argument('--pose', type=str, default="data/data_kf.json", help="transforms.json, pose source")
     parser.add_argument('--au', type=str, default="data/au.csv", help="eye blink area")
diff --git a/autodl/README.md b/autodl/README.md
index 69c473b..f5cfb53 100644
--- a/autodl/README.md
+++ b/autodl/README.md
@@ -30,8 +30,8 @@ python app.py --listenport 6006 --transport rtcpush --push_url 'http://<阿里
 ```
 ### Access
-What you access is the static rtcpushapi.html
-http:///rtcpushapi.html
+What you access is the static rtcpushapi.html
+http:///rtcpushapi.html
 You need to edit web/rtcpushapi.html in the project directory, changing
 将
@@ -51,8 +51,7 @@ var url = "http://公网ip:1985/rtc/v1/whep/?app=live&stream=livestream"
 ![img.png](./img/success.png)
 ## Notes
-1.On autodl, individual users must use the official ssh proxy tool to forward the port before port 6006 can be reached
-2.If you want the musetalk environment on top of the base image, you must set it up yourself
-3.Audio latency requires tuning srs on the backend
-4.musetalk does not support rtmp streaming yet, but supports rtcpush
-5.The musetalk tutorial will be updated soon
\ No newline at end of file
+1. On autodl, individual users must use the official ssh proxy tool to forward the port before port 6006 can be reached
+2. Audio latency requires tuning srs on the backend
+3. musetalk does not support rtmp streaming yet, but supports rtcpush
+4. The musetalk tutorial will be updated soon
\ No newline at end of file
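The key change in app.py above is `multiprocessing.set_start_method('spawn')`: the musereal.py changes below move inference into a child process, and a CUDA context initialized in the parent does not survive a fork, so the child must start from a fresh interpreter. A minimal sketch of the pattern, with a placeholder worker standing in for the real inference process:

```python
import multiprocessing as mp

def worker(q):
    # Placeholder for the real inference process: under 'spawn' this runs in a
    # fresh interpreter, so it can safely initialize its own CUDA context
    # instead of inheriting an unusable forked one from the parent.
    q.put('ready')

if __name__ == '__main__':
    mp.set_start_method('spawn')   # must run once, before any Process is created
    q = mp.Queue()
    p = mp.Process(target=worker, args=(q,))
    p.start()
    print(q.get())                 # -> 'ready'
    p.join()
```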
diff --git a/museasr.py b/museasr.py
index c9a3a82..251225f 100644
--- a/museasr.py
+++ b/museasr.py
@@ -7,6 +7,7 @@ import resampy
 import queue
 from queue import Queue
 from io import BytesIO
+import multiprocessing as mp
 
 from musetalk.whisper.audio2feature import Audio2Feature
 
@@ -19,13 +20,14 @@ class MuseASR:
         self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000)
         self.queue = Queue()
         # self.input_stream = BytesIO()
-        self.output_queue = Queue()
+        self.output_queue = mp.Queue()
         self.audio_processor = audio_processor
         self.batch_size = opt.batch_size
 
         self.stride_left_size = self.stride_right_size = 6
         self.audio_feats = []
+        self.feat_queue = mp.Queue(5)
 
         self.warm_up()
 
@@ -34,7 +36,7 @@ class MuseASR:
 
     def __get_audio_frame(self):
         try:
-            frame = self.queue.get(block=True,timeout=0.02)
+            frame = self.queue.get(block=True,timeout=0.018)
             type = 0
             #print(f'[INFO] get frame {frame.shape}')
         except queue.Empty:
@@ -71,12 +73,12 @@ class MuseASR:
         inputs = np.concatenate(frames) # [N * chunk]
         whisper_feature = self.audio_processor.audio2feat(inputs)
         for feature in whisper_feature:
-            self.audio_feats.append(feature)
-
+            self.audio_feats.append(feature)
         #print(f"processing audio costs {(time.time() - start_time) * 1000}ms, inputs shape:{inputs.shape} whisper_feature len:{len(whisper_feature)}")
-
-    def get_next_feat(self):
         whisper_chunks = self.audio_processor.feature2chunks(feature_array=self.audio_feats,fps=self.fps/2,batch_size=self.batch_size,start=self.stride_left_size/2 )
         #print(f"whisper_chunks len:{len(whisper_chunks)},self.audio_feats len:{len(self.audio_feats)},self.output_queue len:{self.output_queue.qsize()}")
         self.audio_feats = self.audio_feats[-(self.stride_left_size + self.stride_right_size):]
-        return whisper_chunks
\ No newline at end of file
+        self.feat_queue.put(whisper_chunks)
+
+    def get_next_feat(self,block,timeout):
+        return self.feat_queue.get(block,timeout)
\ No newline at end of file
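In museasr.py, `output_queue` and the new `feat_queue` become `multiprocessing` queues: `run_step` now pushes whisper feature chunks into `feat_queue`, and `get_next_feat(block, timeout)` becomes a blocking read, so the audio side in the parent and the inference process communicate purely through queues. A minimal standalone sketch of that handoff, with strings standing in for the real feature arrays; the bound of 5 mirrors `mp.Queue(5)` and applies backpressure so feature extraction cannot run far ahead of rendering:

```python
import multiprocessing as mp
import queue

def consumer(q):
    # Inference-process side, mirroring get_next_feat(block, timeout).
    while True:
        try:
            chunks = q.get(block=True, timeout=1)
        except queue.Empty:
            break                      # no more work in this toy example
        print('inferring on', chunks)

if __name__ == '__main__':
    mp.set_start_method('spawn')
    feat_queue = mp.Queue(5)           # bounded: put() blocks when full
    p = mp.Process(target=consumer, args=(feat_queue,))
    p.start()
    for i in range(10):                # parent side, mirroring run_step()
        feat_queue.put(f'whisper_chunks_{i}')
    p.join()
```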
diff --git a/musereal.py b/musereal.py
index 68cf072..6174f58 100644
--- a/musereal.py
+++ b/musereal.py
@@ -16,9 +16,10 @@ import queue
 from queue import Queue
 from threading import Thread, Event
 from io import BytesIO
+import multiprocessing as mp
 
 from musetalk.utils.utils import get_file_type,get_video_fps,datagen
-from musetalk.utils.preprocessing import get_landmark_and_bbox,read_imgs,coord_placeholder
+#from musetalk.utils.preprocessing import get_landmark_and_bbox,read_imgs,coord_placeholder
 from musetalk.utils.blending import get_image,get_image_prepare_material,get_image_blending
 from musetalk.utils.utils import load_all_model
 from ttsreal import EdgeTTS,VoitsTTS,XTTS
@@ -27,6 +28,102 @@ from museasr import MuseASR
 import asyncio
 from av import AudioFrame, VideoFrame
 
+from tqdm import tqdm
+def read_imgs(img_list):
+    frames = []
+    print('reading images...')
+    for img_path in tqdm(img_list):
+        frame = cv2.imread(img_path)
+        frames.append(frame)
+    return frames
+
+def __mirror_index(size, index):
+    #size = len(self.coord_list_cycle)
+    turn = index // size
+    res = index % size
+    if turn % 2 == 0:
+        return res
+    else:
+        return size - res - 1
+
+def inference(render_event,batch_size,input_latent_list_cycle,audio_feat_queue,audio_out_queue,res_frame_queue,
+              vae, unet, pe, timesteps):
+
+    # _, vae, unet, pe = load_all_model()
+    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # timesteps = torch.tensor([0], device=device)
+    # pe = pe.half()
+    # vae.vae = vae.vae.half()
+    # unet.model = unet.model.half()
+
+    #input_latent_list_cycle = torch.load(latents_out_path)
+    length = len(input_latent_list_cycle)
+    index = 0
+    count=0
+    counttime=0
+    print('start inference')
+    while True:
+        if render_event.is_set():
+            starttime=time.perf_counter()
+            try:
+                whisper_chunks = audio_feat_queue.get(block=True, timeout=1)
+            except queue.Empty:
+                continue
+            is_all_silence=True
+            audio_frames = []
+            for _ in range(batch_size*2):
+                frame,type = audio_out_queue.get()
+                audio_frames.append((frame,type))
+                if type==0:
+                    is_all_silence=False
+            if is_all_silence:
+                for i in range(batch_size):
+                    res_frame_queue.put((None,__mirror_index(length,index),audio_frames[i*2:i*2+2]))
+                    index = index + 1
+            else:
+                # print('infer=======')
+                t=time.perf_counter()
+                whisper_batch = np.stack(whisper_chunks)
+                latent_batch = []
+                for i in range(batch_size):
+                    idx = __mirror_index(length,index+i)
+                    latent = input_latent_list_cycle[idx]
+                    latent_batch.append(latent)
+                latent_batch = torch.cat(latent_batch, dim=0)
+
+                # for i, (whisper_batch,latent_batch) in enumerate(gen):
+                audio_feature_batch = torch.from_numpy(whisper_batch)
+                audio_feature_batch = audio_feature_batch.to(device=unet.device,
+                                                             dtype=unet.model.dtype)
+                audio_feature_batch = pe(audio_feature_batch)
+                latent_batch = latent_batch.to(dtype=unet.model.dtype)
+                # print('prepare time:',time.perf_counter()-t)
+                # t=time.perf_counter()
+
+                pred_latents = unet.model(latent_batch,
+                                          timesteps,
+                                          encoder_hidden_states=audio_feature_batch).sample
+                # print('unet time:',time.perf_counter()-t)
+                # t=time.perf_counter()
+                recon = vae.decode_latents(pred_latents)
+                # print('vae time:',time.perf_counter()-t)
+                #print('diffusion len=',len(recon))
+                counttime += (time.perf_counter() - t)
+                count += batch_size
+                #_totalframe += 1
+                if count>=100:
+                    print(f"------actual avg infer fps:{count/counttime:.4f}")
+                    count=0
+                    counttime=0
+                for i,res_frame in enumerate(recon):
+                    #self.__pushmedia(res_frame,loop,audio_track,video_track)
+                    res_frame_queue.put((res_frame,__mirror_index(length,index),audio_frames[i*2:i*2+2]))
+                    index = index + 1
+                print('total batch time:',time.perf_counter()-starttime)
+        else:
+            time.sleep(1)
+    print('musereal inference processor stop')
+
 @torch.no_grad()
 class MuseReal:
     def __init__(self, opt):
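The module-level `__mirror_index` helper replaces the old instance method so it can be called inside the spawned inference process. It walks the avatar's frame cycle forward, then backward, so playback ping-pongs instead of jumping from the last frame back to the first. A standalone copy (renamed `mirror_index` so it is importable outside the module) showing the resulting sequence:

```python
def mirror_index(size, index):
    # Ping-pong over [0, size): even passes go forward, odd passes backward.
    turn = index // size
    res = index % size
    return res if turn % 2 == 0 else size - res - 1

# With a 4-frame cycle, indices 0..9 map to frames:
print([mirror_index(4, i) for i in range(10)])   # [0, 1, 2, 3, 3, 2, 1, 0, 0, 1]
```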
@@ -55,7 +152,7 @@ class MuseReal:
         }
         self.batch_size = opt.batch_size
         self.idx = 0
-        self.res_frame_queue = Queue()
+        self.res_frame_queue = mp.Queue(self.batch_size*2)
         self.__loadmodels()
         self.__loadavatar()
 
@@ -67,6 +164,11 @@ class MuseReal:
         elif opt.tts == "xtts":
             self.tts = XTTS(opt,self)
         #self.__warm_up()
+
+        self.render_event = mp.Event()
+        mp.Process(target=inference, args=(self.render_event,self.batch_size,self.input_latent_list_cycle,
+                                           self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
+                                           self.vae, self.unet, self.pe,self.timesteps)).start()
 
     def __loadmodels(self):
         # load model weights
@@ -128,59 +230,6 @@ class MuseReal:
                                self.timesteps,
                                encoder_hidden_states=audio_feature_batch).sample
         recon = self.vae.decode_latents(pred_latents)
-
-    def test_step(self,loop=None,audio_track=None,video_track=None):
-
-        # gen = datagen(whisper_chunks,
-        #               self.input_latent_list_cycle,
-        #               self.batch_size)
-        starttime=time.perf_counter()
-        self.asr.run_step()
-        whisper_chunks = self.asr.get_next_feat()
-        is_all_silence=True
-        audio_frames = []
-        for _ in range(self.batch_size*2):
-            frame,type = self.asr.get_audio_out()
-            audio_frames.append((frame,type))
-            if type==0:
-                is_all_silence=False
-        if is_all_silence:
-            for i in range(self.batch_size):
-                self.res_frame_queue.put((None,self.__mirror_index(self.idx),audio_frames[i*2:i*2+2]))
-                self.idx = self.idx + 1
-        else:
-            # print('infer=======')
-            t=time.perf_counter()
-            whisper_batch = np.stack(whisper_chunks)
-            latent_batch = []
-            for i in range(self.batch_size):
-                idx = self.__mirror_index(self.idx+i)
-                latent = self.input_latent_list_cycle[idx]
-                latent_batch.append(latent)
-            latent_batch = torch.cat(latent_batch, dim=0)
-
-            # for i, (whisper_batch,latent_batch) in enumerate(gen):
-            audio_feature_batch = torch.from_numpy(whisper_batch)
-            audio_feature_batch = audio_feature_batch.to(device=self.unet.device,
-                                                         dtype=self.unet.model.dtype)
-            audio_feature_batch = self.pe(audio_feature_batch)
-            latent_batch = latent_batch.to(dtype=self.unet.model.dtype)
-            # print('prepare time:',time.perf_counter()-t)
-            # t=time.perf_counter()
-
-            pred_latents = self.unet.model(latent_batch,
-                                           self.timesteps,
-                                           encoder_hidden_states=audio_feature_batch).sample
-            # print('unet time:',time.perf_counter()-t)
-            # t=time.perf_counter()
-            recon = self.vae.decode_latents(pred_latents)
-            # print('vae time:',time.perf_counter()-t)
-            #print('diffusion len=',len(recon))
-            for i,res_frame in enumerate(recon):
-                #self.__pushmedia(res_frame,loop,audio_track,video_track)
-                self.res_frame_queue.put((res_frame,self.__mirror_index(self.idx),audio_frames[i*2:i*2+2]))
-                self.idx = self.idx + 1
-            print('total batch time:',time.perf_counter()-starttime)
 
     def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
@@ -203,7 +252,9 @@ class MuseReal:
             mask = self.mask_list_cycle[idx]
             mask_crop_box = self.mask_coords_list_cycle[idx]
             #combine_frame = get_image(ori_frame,res_frame,bbox)
+            #t=time.perf_counter()
             combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)
+            #print('blending time:',time.perf_counter()-t)
             image = combine_frame #(outputs['image'] * 255).astype(np.uint8)
             new_frame = VideoFrame.from_ndarray(image, format="bgr24")
@@ -228,6 +279,7 @@ class MuseReal:
         process_thread = Thread(target=self.process_frames, args=(quit_event,loop,audio_track,video_track))
         process_thread.start()
 
+        self.render_event.set() #start infer process render
         count=0
         totaltime=0
         _starttime=time.perf_counter()
@@ -236,20 +288,21 @@ class MuseReal:
             # update texture every frame
             # audio stream thread...
             t = time.perf_counter()
-            self.test_step(loop,audio_track,video_track)
-            totaltime += (time.perf_counter() - t)
-            count += self.opt.batch_size
-            #_totalframe += 1
-            if count>=100:
-                print(f"------actual avg infer fps:{count/totaltime:.4f}")
-                count=0
-                totaltime=0
+            self.asr.run_step()
+            #self.test_step(loop,audio_track,video_track)
+            # totaltime += (time.perf_counter() - t)
+            # count += self.opt.batch_size
+            # if count>=100:
+            #     print(f"------actual avg infer fps:{count/totaltime:.4f}")
+            #     count=0
+            #     totaltime=0
             if video_track._queue.qsize()>=2*self.opt.batch_size:
-                #print('sleep qsize=',video_track._queue.qsize())
+                print('sleep qsize=',video_track._queue.qsize())
                 time.sleep(0.04*self.opt.batch_size*1.5)
             # delay = _starttime+_totalframe*0.04-time.perf_counter() #40ms
             # if delay > 0:
             #     time.sleep(delay)
+        self.render_event.clear() #end infer process render
         print('musereal thread stop')
\ No newline at end of file
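With `test_step` removed, `render()` now just pumps the ASR via `self.asr.run_step()` and toggles `render_event` to start and stop the inference process, which idles in one-second sleeps while the event is clear. A minimal sketch of that event-gated worker loop, with a no-op standing in for a batch of inference:

```python
import multiprocessing as mp
import time

def worker(render_event):
    # Mirrors the inference() loop above: busy only while the event is set.
    while True:
        if render_event.is_set():
            time.sleep(0.01)       # a batch of inference would run here
        else:
            time.sleep(1)          # parked until render() sets the event again

if __name__ == '__main__':
    mp.set_start_method('spawn')
    event = mp.Event()
    mp.Process(target=worker, args=(event,), daemon=True).start()
    event.set()                    # render() starting: wake the worker
    time.sleep(0.1)                # ... streaming runs ...
    event.clear()                  # render() exiting: park the worker again
```

For pacing, the `time.sleep(0.04*self.opt.batch_size*1.5)` throttle backs off by roughly one and a half batch durations (40 ms per frame at 25 fps) whenever the video track queue holds two batches or more, so the producer cannot outrun the stream.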