diff --git a/app.py b/app.py
index 4f3c255..9292cd0 100644
--- a/app.py
+++ b/app.py
@@ -140,7 +140,7 @@ async def human(request):
     if params['type']=='echo':
         nerfreals[sessionid].put_msg_txt(params['text'])
     elif params['type']=='chat':
-        res=await asyncio.get_event_loop().run_in_executor(None, llm_response(params['text']))
+        res=await asyncio.get_event_loop().run_in_executor(None, llm_response(params['text']))
         nerfreals[sessionid].put_msg_txt(res)
 
     return web.Response(
@@ -150,6 +150,19 @@ async def human(request):
         ),
     )
 
+async def set_audiotype(request):
+    params = await request.json()
+
+    sessionid = params.get('sessionid',0)
+    nerfreals[sessionid].set_curr_state(params['audiotype'],params['reinit'])
+
+    return web.Response(
+        content_type="application/json",
+        text=json.dumps(
+            {"code": 0, "data":"ok"}
+        ),
+    )
+
 async def on_shutdown(app):
     # close peer connections
     coros = [pc.close() for pc in pcs]
@@ -307,6 +320,8 @@ if __name__ == '__main__':
     parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img')
     parser.add_argument('--customvideo_imgnum', type=int, default=1)
 
+    parser.add_argument('--customvideo_config', type=str, default='')
+
     parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
     parser.add_argument('--REF_FILE', type=str, default=None)
     parser.add_argument('--REF_TEXT', type=str, default=None)
@@ -325,6 +340,10 @@ if __name__ == '__main__':
     opt = parser.parse_args()
     #app.config.from_object(opt)
     #print(app.config)
+    opt.customopt = []
+    if opt.customvideo_config!='':
+        with open(opt.customvideo_config,'r') as file:
+            opt.customopt = json.load(file)
 
     if opt.model == 'ernerf':
         from ernerf.nerf_triplane.provider import NeRFDataset_Test
@@ -402,6 +421,7 @@ if __name__ == '__main__':
     appasync.on_shutdown.append(on_shutdown)
     appasync.router.add_post("/offer", offer)
     appasync.router.add_post("/human", human)
+    appasync.router.add_post("/set_audiotype", set_audiotype)
     appasync.router.add_static('/',path='web')
 
     # Configure default CORS settings.
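The new `/set_audiotype` route is what switches a session into (and out of) the custom-video states introduced in `basereal.py` below. A minimal client sketch, not part of the diff; it assumes the aiohttp server is reachable at `localhost:8010`, so substitute whatever host and port you actually launch `app.py` with:

```python
# Hypothetical client for the new endpoint; the base URL is an assumption.
import requests

BASE = "http://localhost:8010"

# Switch session 0 to the custom clip registered as audiotype 2 and restart
# its frame/audio counters (reinit=True maps to set_curr_state's reinit flag).
requests.post(f"{BASE}/set_audiotype",
              json={"sessionid": 0, "audiotype": 2, "reinit": True})

# audiotype 1 is the plain silent/idle state, so this switches back off the clip.
requests.post(f"{BASE}/set_audiotype",
              json={"sessionid": 0, "audiotype": 1, "reinit": False})
```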
diff --git a/baseasr.py b/baseasr.py
index df66873..d58170c 100644
--- a/baseasr.py
+++ b/baseasr.py
@@ -7,8 +7,9 @@ import multiprocessing as mp
 
 class BaseASR:
-    def __init__(self, opt):
+    def __init__(self, opt, parent=None):
         self.opt = opt
+        self.parent = parent
 
         self.fps = opt.fps # 20 ms per frame
         self.sample_rate = 16000
@@ -38,8 +39,12 @@ class BaseASR:
             type = 0
             #print(f'[INFO] get frame {frame.shape}')
         except queue.Empty:
-            frame = np.zeros(self.chunk, dtype=np.float32)
-            type = 1
+            if self.parent and self.parent.curr_state>1: # play the custom audio for this state
+                frame = self.parent.get_audio_stream(self.parent.curr_state)
+                type = self.parent.curr_state
+            else:
+                frame = np.zeros(self.chunk, dtype=np.float32)
+                type = 1
 
         return frame,type
diff --git a/basereal.py b/basereal.py
new file mode 100644
index 0000000..4a7f5de
--- /dev/null
+++ b/basereal.py
@@ -0,0 +1,81 @@
+import math
+import torch
+import numpy as np
+
+import os
+import time
+import cv2
+import glob
+import pickle
+import copy
+
+import queue
+from queue import Queue
+from threading import Thread, Event
+from io import BytesIO
+import soundfile as sf
+
+from tqdm import tqdm
+
+def read_imgs(img_list):
+    frames = []
+    print('reading images...')
+    for img_path in tqdm(img_list):
+        frame = cv2.imread(img_path)
+        frames.append(frame)
+    return frames
+
+class BaseReal:
+    def __init__(self, opt):
+        self.opt = opt
+        self.sample_rate = 16000
+        self.chunk = self.sample_rate // opt.fps # 320 samples per chunk (20ms * 16000 / 1000)
+
+        self.curr_state=0
+        self.custom_img_cycle = {}
+        self.custom_audio_cycle = {}
+        self.custom_audio_index = {}
+        self.custom_index = {}
+        self.custom_opt = {}
+        self.__loadcustom()
+
+    def __loadcustom(self):
+        for item in self.opt.customopt:
+            print(item)
+            input_img_list = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
+            input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
+            self.custom_img_cycle[item['audiotype']] = read_imgs(input_img_list)
+            self.custom_audio_cycle[item['audiotype']], sample_rate = sf.read(item['audiopath'], dtype='float32')
+            self.custom_audio_index[item['audiotype']] = 0
+            self.custom_index[item['audiotype']] = 0
+            self.custom_opt[item['audiotype']] = item
+
+    def mirror_index(self,size, index):
+        #size = len(self.coord_list_cycle)
+        turn = index // size
+        res = index % size
+        if turn % 2 == 0:
+            return res
+        else:
+            return size - res - 1
+
+    def get_audio_stream(self,audiotype):
+        idx = self.custom_audio_index[audiotype]
+        stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
+        self.custom_audio_index[audiotype] += self.chunk
+        if self.custom_audio_index[audiotype]>=self.custom_audio_cycle[audiotype].shape[0]:
+            self.curr_state = 1 # the custom audio does not loop; switch to the silent state
+        return stream
+
+    def set_curr_state(self,audiotype, reinit):
+        self.curr_state = audiotype
+        if reinit:
+            self.custom_audio_index[audiotype] = 0
+            self.custom_index[audiotype] = 0
+
+    # def process_custom(self,audiotype:int,idx:int):
+    #     if self.curr_state!=audiotype: # switching from inference to the scripted clip
+    #         if idx in self.switch_pos:  # only switch at a marked cut point
+    #             self.curr_state=audiotype
+    #             self.custom_index=0
+    #     else:
+    #         self.custom_index+=1
\ No newline at end of file
diff --git a/data/custom_config.json b/data/custom_config.json
new file mode 100644
index 0000000..9fc54a7
--- /dev/null
+++ b/data/custom_config.json
@@ -0,0 +1,7 @@
+[
+    {
+        "audiotype":2,
+        "imgpath":"data/customvideo/image",
+        "audiopath":"data/customvideo/audio.wav"
+    }
+]
\ No newline at end of file
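`BaseReal.mirror_index` is what keeps a short custom clip from visibly jumping when it repeats: the index walks forward through the frame list, then backward, then forward again. A standalone illustration of that ping-pong order (not part of the diff):

```python
# Copy of the indexing logic from BaseReal.mirror_index, for illustration only.
def mirror_index(size, index):
    turn = index // size
    res = index % size
    return res if turn % 2 == 0 else size - res - 1

print([mirror_index(5, i) for i in range(12)])
# -> [0, 1, 2, 3, 4, 4, 3, 2, 1, 0, 0, 1]
```

Note that `__loadcustom` sorts the clip's frames by the integer value of their file names, so the images under `imgpath` are expected to be named `0.jpg`, `1.png`, and so on.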
diff --git a/lipreal.py b/lipreal.py
index 9461e7b..460c43f 100644
--- a/lipreal.py
+++ b/lipreal.py
@@ -23,8 +23,8 @@ from ttsreal import EdgeTTS,VoitsTTS,XTTS
 from lipasr import LipASR
 import asyncio
 from av import AudioFrame, VideoFrame
-
 from wav2lip.models import Wav2Lip
+from basereal import BaseReal
 
 from tqdm import tqdm
@@ -143,9 +143,10 @@ def inference(render_event,batch_size,face_imgs_path,audio_feat_queue,audio_out_
     print('musereal inference processor stop')
 
 @torch.no_grad()
-class LipReal:
+class LipReal(BaseReal):
     def __init__(self, opt):
-        self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
+        super().__init__(opt)
+        #self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
 
         self.W = opt.W
         self.H = opt.H
@@ -163,7 +164,7 @@ class LipReal:
         #self.__loadmodels()
         self.__loadavatar()
 
-        self.asr = LipASR(opt)
+        self.asr = LipASR(opt,self)
         self.asr.warm_up()
         if opt.tts == "edgetts":
             self.tts = EdgeTTS(opt,self)
@@ -213,8 +214,16 @@ class LipReal:
                 res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
-            if audio_frames[0][1]==1 and audio_frames[1][1]==1: # all silence data, just take the full image
-                combine_frame = self.frame_list_cycle[idx]
+            if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: # no speech audio, just take the full image
+                audiotype = audio_frames[0][1]
+                if self.custom_index.get(audiotype) is not None: # a custom video is configured for this state
+                    mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
+                    combine_frame = self.custom_img_cycle[audiotype][mirindex]
+                    self.custom_index[audiotype] += 1
+                    # if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]):
+                    #     self.curr_state = 1 # the custom video does not loop; switch to the silent state
+                else:
+                    combine_frame = self.frame_list_cycle[idx]
             else:
                 bbox = self.coord_list_cycle[idx]
                 combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
diff --git a/llm/VllmGPT.py b/llm/VllmGPT.py
index a8db6aa..b5ae5ec 100644
--- a/llm/VllmGPT.py
+++ b/llm/VllmGPT.py
@@ -15,7 +15,7 @@ class VllmGPT:
         self.__URL = "http://{}:{}/v1/completions".format(self.host, self.port)
         self.__URL2 = "http://{}:{}/v1/chat/completions".format(self.host, self.port)
 
-    def question(self,cont):
+    def chat(self,cont):
         chat_list = []
         # contentdb = content_db.new_instance()
         # list = contentdb.get_list('all','desc',11)
@@ -77,5 +77,5 @@ class VllmGPT:
 
 if __name__ == "__main__":
     vllm = VllmGPT('192.168.1.3','8101')
-    req = vllm.question("你叫什么名字啊今年多大了")
+    req = vllm.chat("你叫什么名字啊今年多大了")
     print(req)
diff --git a/web/webrtcapi-custom.html b/web/webrtcapi-custom.html
new file mode 100644
index 0000000..eaef394
--- /dev/null
+++ b/web/webrtcapi-custom.html
@@ -0,0 +1,113 @@
[new 113-line "WebRTC webcam" demo page; its markup did not survive extraction. The surviving fragments indicate a text-input form, start/stop controls, and a "Media" section with the playback element.]
diff --git a/web/webrtcapi.html b/web/webrtcapi.html
index 7f874a9..af269d4 100644
--- a/web/webrtcapi.html
+++ b/web/webrtcapi.html
@@ -30,7 +30,7 @@
[one-line markup change near the "input text" form; the removed and added HTML did not survive extraction]
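Putting the pieces together: the custom states only behave well when the assets referenced by `data/custom_config.json` match what `BaseReal.__loadcustom` and `BaseASR` expect, namely integer-named frame images and audio that is already 16 kHz mono (`sf.read` does not resample, and `BaseASR` streams fixed-size chunks at its hard-coded 16 kHz sample rate). A hedged sanity-check script, not part of the diff, with paths following the sample config:

```python
# Check custom-video assets against the assumptions made in basereal.py/baseasr.py.
import glob
import json
import os

import soundfile as sf

with open('data/custom_config.json') as f:
    items = json.load(f)

for item in items:
    imgs = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
    # __loadcustom sorts by int(filename), so frames should be named 0.jpg, 1.jpg, ...
    numeric_names = all(os.path.splitext(os.path.basename(p))[0].isdigit() for p in imgs)
    audio, sr = sf.read(item['audiopath'], dtype='float32')
    print(f"audiotype {item['audiotype']}: {len(imgs)} frames, "
          f"numeric names: {numeric_names}, audio: {sr} Hz, mono: {audio.ndim == 1}")
    if sr != 16000:
        print("  warning: BaseASR serves audio at 16 kHz; resample this file first")
```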