diff --git a/README.md b/README.md
index cde84a0..946aad7 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ python app.py
 export HF_ENDPOINT=https://hf-mirror.com
 ```
 
-Open http://serverip:8010/rtcpush.html in a browser, type any text into the text box, and submit it. The digital human will speak the text.
+Open http://serverip:8010/rtcpushapi.html in a browser, type any text into the text box, and submit it. The digital human will speak the text.
 Note: the server needs to open ports tcp:8000,8010,1985; udp:8000
 
 ## 3. More Usage
@@ -128,7 +128,7 @@ python app.py --customvideo --customvideo_img data/customvideo/img --customvideo
 ```
 python app.py --transport webrtc
 ```
-Open http://serverip:8010/webrtc.html in a browser
+Open http://serverip:8010/webrtcapi.html in a browser
 
 ### 3.8 Pushing rtmp to srs
 - Install the rtmpstream library
@@ -142,7 +142,7 @@ docker run --rm -it -p 1935:1935 -p 1985:1985 -p 8080:8080 registry.cn-hangzhou.
 ```python
 python app.py --transport rtmp --push_url 'rtmp://localhost/live/livestream'
 ```
-Open http://serverip:8010/echo.html in a browser
+Open http://serverip:8010/echoapi.html in a browser
 
 ### 3.9 Using the musetalk model
 rtmp push is not supported yet
@@ -161,7 +161,7 @@ mim install "mmpose>=1.1.0"
 Download the digital human model (link: https://caiyun.139.com/m/i?2eAjs8optksop, extraction code: 3mkt); after unzipping, copy the whole folder into this project's data/avatars directory
 - Run python app.py --model musetalk --transport webrtc
-Open http://serverip:8010/webrtc.html in a browser
+Open http://serverip:8010/webrtcapi.html in a browser
 You can set --batch_size to improve GPU utilization, and --avatar_id to run a different digital human
 #### Replacing with your own digital human
 ```bash
diff --git a/musereal.py b/musereal.py
index 6174f58..d92ee85 100644
--- a/musereal.py
+++ b/musereal.py
@@ -21,7 +21,7 @@ import multiprocessing as mp
 from musetalk.utils.utils import get_file_type,get_video_fps,datagen
 #from musetalk.utils.preprocessing import get_landmark_and_bbox,read_imgs,coord_placeholder
 from musetalk.utils.blending import get_image,get_image_prepare_material,get_image_blending
-from musetalk.utils.utils import load_all_model
+from musetalk.utils.utils import load_all_model,load_diffusion_model,load_audio_model
 from ttsreal import EdgeTTS,VoitsTTS,XTTS
 from museasr import MuseASR
@@ -46,17 +46,17 @@ def __mirror_index(size, index):
     else:
         return size - res - 1
 
-def inference(render_event,batch_size,input_latent_list_cycle,audio_feat_queue,audio_out_queue,res_frame_queue,
-              vae, unet, pe,timesteps):
+def inference(render_event,batch_size,latents_out_path,audio_feat_queue,audio_out_queue,res_frame_queue,
+              ): #vae, unet, pe,timesteps
-    # _, vae, unet, pe = load_all_model()
-    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    # timesteps = torch.tensor([0], device=device)
-    # pe = pe.half()
-    # vae.vae = vae.vae.half()
-    # unet.model = unet.model.half()
+    vae, unet, pe = load_diffusion_model()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    timesteps = torch.tensor([0], device=device)
+    pe = pe.half()
+    vae.vae = vae.vae.half()
+    unet.model = unet.model.half()
-    #input_latent_list_cycle = torch.load(latents_out_path)
+    input_latent_list_cycle = torch.load(latents_out_path)
     length = len(input_latent_list_cycle)
     index = 0
     count=0
@@ -119,7 +119,7 @@ def inference(render_event,batch_size,input_latent_list_cycle,audio_feat_queue,a
                 #self.__pushmedia(res_frame,loop,audio_track,video_track)
                 res_frame_queue.put((res_frame,__mirror_index(length,index),audio_frames[i*2:i*2+2]))
                 index = index + 1
-            print('total batch time:',time.perf_counter()-starttime)
+            #print('total batch time:',time.perf_counter()-starttime)
         else:
             time.sleep(1)
     print('musereal inference processor stop')
@@ -166,21 +166,22 @@ class MuseReal:
         #self.__warm_up()
 
         self.render_event = mp.Event()
-        mp.Process(target=inference, args=(self.render_event,self.batch_size,self.input_latent_list_cycle,
+        mp.Process(target=inference, args=(self.render_event,self.batch_size,self.latents_out_path,
                            self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
-                           self.vae, self.unet, self.pe,self.timesteps)).start()
+                           )).start() #self.vae, self.unet, self.pe,self.timesteps
 
     def __loadmodels(self):
         # load model weights
-        self.audio_processor, self.vae, self.unet, self.pe = load_all_model()
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.timesteps = torch.tensor([0], device=device)
-        self.pe = self.pe.half()
-        self.vae.vae = self.vae.vae.half()
-        self.unet.model = self.unet.model.half()
+        self.audio_processor= load_audio_model()
+        # self.audio_processor, self.vae, self.unet, self.pe = load_all_model()
+        # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # self.timesteps = torch.tensor([0], device=device)
+        # self.pe = self.pe.half()
+        # self.vae.vae = self.vae.vae.half()
+        # self.unet.model = self.unet.model.half()
 
     def __loadavatar(self):
-        self.input_latent_list_cycle = torch.load(self.latents_out_path)
+        #self.input_latent_list_cycle = torch.load(self.latents_out_path)
         with open(self.coords_path, 'rb') as f:
             self.coord_list_cycle = pickle.load(f)
         input_img_list = glob.glob(os.path.join(self.full_imgs_path, '*.[jpJP][pnPN]*[gG]'))
diff --git a/musetalk/utils/utils.py b/musetalk/utils/utils.py
index caac0fb..f1c77ac 100644
--- a/musetalk/utils/utils.py
+++ b/musetalk/utils/utils.py
@@ -62,3 +62,14 @@ def datagen(whisper_chunks,
             latent_batch = torch.cat(latent_batch, dim=0)
 
         yield whisper_batch, latent_batch
+
+def load_audio_model():
+    audio_processor = Audio2Feature(model_path="./models/whisper/tiny.pt")
+    return audio_processor
+
+def load_diffusion_model():
+    vae = VAE(model_path = "./models/sd-vae-ft-mse/")
+    unet = UNet(unet_config="./models/musetalk/musetalk.json",
+                model_path ="./models/musetalk/pytorch_model.bin")
+    pe = PositionalEncoding(d_model=384)
+    return vae,unet,pe
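
Note on the musereal.py change above: it follows a standard PyTorch multiprocessing pattern. Instead of building the VAE/UNet/positional-encoding models in the parent and pickling them (plus the latent tensors) through `mp.Process` args, the parent now passes only `latents_out_path`, and the worker constructs the models itself via `load_diffusion_model()` and materializes the latents with `torch.load`. Below is a minimal self-contained sketch of that pattern, with a toy `torch.nn.Linear` standing in for the diffusion stack — everything in it is illustrative, not the project's actual classes or queue payloads:

```python
import multiprocessing as mp

import torch


def load_heavy_model(device):
    # Stand-in for load_diffusion_model(): the network is constructed inside
    # the worker, so no CUDA-resident weights cross the process boundary.
    model = torch.nn.Linear(8, 8).to(device)
    if device.type == "cuda":
        model = model.half()
    return model


def inference_worker(render_event, latents_path, out_queue, num_cycles=2):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_heavy_model(device)
    dtype = next(model.parameters()).dtype
    # The latents travel as a plain file path; only the worker loads them.
    latent_cycle = torch.load(latents_path, map_location=device)
    render_event.wait()  # in MuseReal the event gates rendering; here it gates the start
    for _ in range(num_cycles):
        for latent in latent_cycle:
            with torch.no_grad():
                frame = model(latent.to(device=device, dtype=dtype))
            out_queue.put(frame.float().cpu())  # ship only CPU tensors back
    out_queue.put(None)  # sentinel: done


if __name__ == "__main__":
    # The parent saves the latents to disk once (as MuseReal does with
    # latents_out_path) and never touches CUDA itself, which keeps the
    # mp.Process arguments cheap and safely picklable.
    torch.save([torch.randn(8) for _ in range(4)], "latents.pt")
    render_event = mp.Event()
    out_queue = mp.Queue()
    proc = mp.Process(target=inference_worker,
                      args=(render_event, "latents.pt", out_queue))
    proc.start()
    render_event.set()
    while (frame := out_queue.get()) is not None:
        print(frame.shape)  # torch.Size([8])
    proc.join()
```

The same reasoning explains the `load_all_model()` split in utils.py: the parent process keeps only `load_audio_model()` (the whisper feature extractor it needs for ASR), while the GPU-heavy `load_diffusion_model()` stack is created exclusively inside the inference worker.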
diff --git a/web/echoapi.html b/web/echoapi.html
new file mode 100644
index 0000000..a410966
--- /dev/null
+++ b/web/echoapi.html
@@ -0,0 +1,73 @@
+<!-- echoapi.html (73 lines): demo page for the rtmp/echo transport; the file body is truncated in this copy of the diff -->
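
For context on the renamed demo pages (rtcpushapi.html, webrtcapi.html, echoapi.html): per the README, each page presents a text box whose contents are submitted to the server for the digital human to speak. A hypothetical minimal client for that submit action is sketched below; the `/human` route and the JSON payload shape are assumptions for illustration, not confirmed by this diff — check app.py for the route the pages actually post to.

```python
# Hypothetical client for the demo pages' submit action. The endpoint path
# ("/human") and payload ({"text": ...}) are assumed, not taken from this diff.
import json
from urllib.request import Request, urlopen


def speak(text, host="http://serverip:8010"):
    req = Request(
        f"{host}/human",  # assumed route; verify against app.py
        data=json.dumps({"text": text}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urlopen(req) as resp:
        return resp.read()


if __name__ == "__main__":
    speak("Hello, this text will be spoken by the digital human.")
```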