diff --git a/app.py b/app.py index 45281bb..65ddb8a 100644 --- a/app.py +++ b/app.py @@ -186,6 +186,18 @@ async def record(request): ), ) +async def is_speaking(request): + params = await request.json() + + sessionid = params.get('sessionid',0) + return web.Response( + content_type="application/json", + text=json.dumps( + {"code": 0, "data": nerfreals[sessionid].is_speaking()} + ), + ) + + async def on_shutdown(app): # close peer connections coros = [pc.close() for pc in pcs] @@ -445,6 +457,7 @@ if __name__ == '__main__': appasync.router.add_post("/human", human) appasync.router.add_post("/set_audiotype", set_audiotype) appasync.router.add_post("/record", record) + appasync.router.add_post("/is_speaking", is_speaking) appasync.router.add_static('/',path='web') # Configure default CORS settings. diff --git a/baseasr.py b/baseasr.py index d58170c..7c370a3 100644 --- a/baseasr.py +++ b/baseasr.py @@ -48,6 +48,9 @@ class BaseASR: return frame,type + def is_audio_frame_empty(self)->bool: + return self.queue.empty() + def get_audio_out(self): #get origin audio pcm to nerf return self.output_queue.get() diff --git a/basereal.py b/basereal.py index 67449d3..e21e3a7 100644 --- a/basereal.py +++ b/basereal.py @@ -44,6 +44,8 @@ class BaseReal: elif opt.tts == "cosyvoice": self.tts = CosyVoiceTTS(opt,self) + self.speaking = False + self.recording = False self.recordq_video = Queue() self.recordq_audio = Queue() @@ -55,6 +57,19 @@ class BaseReal: self.custom_index = {} self.custom_opt = {} self.__loadcustom() + + def put_msg_txt(self,msg): + self.tts.put_msg_txt(msg) + + def put_audio_frame(self,audio_chunk): #16khz 20ms pcm + self.asr.put_audio_frame(audio_chunk) + + def pause_talk(self): + self.tts.pause_talk() + self.asr.pause_talk() + + def is_speaking(self)->bool: + return self.speaking def __loadcustom(self): for item in self.opt.customopt: diff --git a/lipreal.py b/lipreal.py index a18f2f7..d742fdb 100644 --- a/lipreal.py +++ b/lipreal.py @@ -191,17 +191,6 @@ class LipReal(BaseReal): input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0])) self.frame_list_cycle = read_imgs(input_img_list) #self.imagecache = ImgCache(len(self.coord_list_cycle),self.full_imgs_path,1000) - - - def put_msg_txt(self,msg): - self.tts.put_msg_txt(msg) - - def put_audio_frame(self,audio_chunk): #16khz 20ms pcm - self.asr.put_audio_frame(audio_chunk) - - def pause_talk(self): - self.tts.pause_talk() - self.asr.pause_talk() def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None): @@ -212,6 +201,7 @@ class LipReal(BaseReal): except queue.Empty: continue if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据,只需要取fullimg + self.speaking = False audiotype = audio_frames[0][1] if self.custom_index.get(audiotype) is not None: #有自定义视频 mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype]) @@ -223,6 +213,7 @@ class LipReal(BaseReal): combine_frame = self.frame_list_cycle[idx] #combine_frame = self.imagecache.get_img(idx) else: + self.speaking = True bbox = self.coord_list_cycle[idx] combine_frame = copy.deepcopy(self.frame_list_cycle[idx]) #combine_frame = copy.deepcopy(self.imagecache.get_img(idx)) diff --git a/musereal.py b/musereal.py index e5fb58a..b339d42 100644 --- a/musereal.py +++ b/musereal.py @@ -189,17 +189,6 @@ class MuseReal(BaseReal): input_mask_list = glob.glob(os.path.join(self.mask_out_path, '*.[jpJP][pnPN]*[gG]')) input_mask_list = sorted(input_mask_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0])) self.mask_list_cycle = read_imgs(input_mask_list) - - - def put_msg_txt(self,msg): - self.tts.put_msg_txt(msg) - - def put_audio_frame(self,audio_chunk): #16khz 20ms pcm - self.asr.put_audio_frame(audio_chunk) - - def pause_talk(self): - self.tts.pause_talk() - self.asr.pause_talk() def __mirror_index(self, index): @@ -243,6 +232,7 @@ class MuseReal(BaseReal): except queue.Empty: continue if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据,只需要取fullimg + self.speaking = False audiotype = audio_frames[0][1] if self.custom_index.get(audiotype) is not None: #有自定义视频 mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype]) @@ -253,6 +243,7 @@ class MuseReal(BaseReal): else: combine_frame = self.frame_list_cycle[idx] else: + self.speaking = True bbox = self.coord_list_cycle[idx] ori_frame = copy.deepcopy(self.frame_list_cycle[idx]) x1, y1, x2, y2 = bbox diff --git a/nerfreal.py b/nerfreal.py index 10a221a..04805ae 100644 --- a/nerfreal.py +++ b/nerfreal.py @@ -126,17 +126,7 @@ class NeRFReal(BaseReal): def __exit__(self, exc_type, exc_value, traceback): if self.opt.asr: - self.asr.stop() - - def put_msg_txt(self,msg): - self.tts.put_msg_txt(msg) - - def put_audio_frame(self,audio_chunk): #16khz 20ms pcm - self.asr.put_audio_frame(audio_chunk) - - def pause_talk(self): - self.tts.pause_talk() - self.asr.pause_talk() + self.asr.stop() # def mirror_index(self, index): @@ -200,6 +190,11 @@ class NeRFReal(BaseReal): # # time.sleep(0.1) # asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) #t = time.time() + if audiotype1!=0 and audiotype2!=0: #全为静音数据 + self.speaking = False + else: + self.speaking = True + if audiotype1!=0 and audiotype2!=0 and self.custom_index.get(audiotype1) is not None: #不为推理视频并且有自定义视频 mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype1]),self.custom_index[audiotype1]) #imgindex = self.mirror_index(self.customimg_index) diff --git a/web/asr/index.html b/web/asr/index.html index 68b4ae0..902518c 100644 --- a/web/asr/index.html +++ b/web/asr/index.html @@ -12,10 +12,6 @@ - -