diff --git a/app.py b/app.py
index 45281bb..65ddb8a 100644
--- a/app.py
+++ b/app.py
@@ -186,6 +186,18 @@ async def record(request):
         ),
     )
 
+async def is_speaking(request):
+    params = await request.json()
+
+    sessionid = params.get('sessionid',0)
+    return web.Response(
+        content_type="application/json",
+        text=json.dumps(
+            {"code": 0, "data": nerfreals[sessionid].is_speaking()}
+        ),
+    )
+
+
 async def on_shutdown(app):
     # close peer connections
     coros = [pc.close() for pc in pcs]
@@ -445,6 +457,7 @@ if __name__ == '__main__':
     appasync.router.add_post("/human", human)
     appasync.router.add_post("/set_audiotype", set_audiotype)
     appasync.router.add_post("/record", record)
+    appasync.router.add_post("/is_speaking", is_speaking)
     appasync.router.add_static('/',path='web')
 
     # Configure default CORS settings.
diff --git a/baseasr.py b/baseasr.py
index d58170c..7c370a3 100644
--- a/baseasr.py
+++ b/baseasr.py
@@ -48,6 +48,9 @@ class BaseASR:
 
         return frame,type
 
+    def is_audio_frame_empty(self)->bool:
+        return self.queue.empty()
+
     def get_audio_out(self): #get origin audio pcm to nerf
         return self.output_queue.get()
 
diff --git a/basereal.py b/basereal.py
index 67449d3..e21e3a7 100644
--- a/basereal.py
+++ b/basereal.py
@@ -44,6 +44,8 @@ class BaseReal:
         elif opt.tts == "cosyvoice":
             self.tts = CosyVoiceTTS(opt,self)
 
+        self.speaking = False
+
         self.recording = False
         self.recordq_video = Queue()
         self.recordq_audio = Queue()
@@ -55,6 +57,19 @@ class BaseReal:
         self.custom_index = {}
         self.custom_opt = {}
         self.__loadcustom()
+
+    def put_msg_txt(self,msg):
+        self.tts.put_msg_txt(msg)
+
+    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
+        self.asr.put_audio_frame(audio_chunk)
+
+    def pause_talk(self):
+        self.tts.pause_talk()
+        self.asr.pause_talk()
+
+    def is_speaking(self)->bool:
+        return self.speaking
 
     def __loadcustom(self):
         for item in self.opt.customopt:
diff --git a/lipreal.py b/lipreal.py
index a18f2f7..d742fdb 100644
--- a/lipreal.py
+++ b/lipreal.py
@@ -191,17 +191,6 @@ class LipReal(BaseReal):
         input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
         self.frame_list_cycle = read_imgs(input_img_list)
         #self.imagecache = ImgCache(len(self.coord_list_cycle),self.full_imgs_path,1000)
-
-
-    def put_msg_txt(self,msg):
-        self.tts.put_msg_txt(msg)
-
-    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
-        self.asr.put_audio_frame(audio_chunk)
-
-    def pause_talk(self):
-        self.tts.pause_talk()
-        self.asr.pause_talk()
 
 
     def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
@@ -212,6 +201,7 @@ class LipReal(BaseReal):
             except queue.Empty:
                 continue
             if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: # all silence, just use the full image
+                self.speaking = False
                 audiotype = audio_frames[0][1]
                 if self.custom_index.get(audiotype) is not None: # a custom video is configured
                     mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
@@ -223,6 +213,7 @@ class LipReal(BaseReal):
                     combine_frame = self.frame_list_cycle[idx]
                     #combine_frame = self.imagecache.get_img(idx)
             else:
+                self.speaking = True
                 bbox = self.coord_list_cycle[idx]
                 combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
                 #combine_frame = copy.deepcopy(self.imagecache.get_img(idx))
diff --git a/musereal.py b/musereal.py
index e5fb58a..b339d42 100644
--- a/musereal.py
+++ b/musereal.py
@@ -189,17 +189,6 @@ class MuseReal(BaseReal):
         input_mask_list = glob.glob(os.path.join(self.mask_out_path, '*.[jpJP][pnPN]*[gG]'))
         input_mask_list = sorted(input_mask_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
         self.mask_list_cycle = read_imgs(input_mask_list)
-
-
-    def put_msg_txt(self,msg):
-        self.tts.put_msg_txt(msg)
-
-    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
-        self.asr.put_audio_frame(audio_chunk)
-
-    def pause_talk(self):
-        self.tts.pause_talk()
-        self.asr.pause_talk()
 
 
     def __mirror_index(self, index):
@@ -243,6 +232,7 @@ class MuseReal(BaseReal):
             except queue.Empty:
                 continue
             if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: # all silence, just use the full image
+                self.speaking = False
                 audiotype = audio_frames[0][1]
                 if self.custom_index.get(audiotype) is not None: # a custom video is configured
                     mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
@@ -253,6 +243,7 @@ class MuseReal(BaseReal):
                 else:
                     combine_frame = self.frame_list_cycle[idx]
             else:
+                self.speaking = True
                 bbox = self.coord_list_cycle[idx]
                 ori_frame = copy.deepcopy(self.frame_list_cycle[idx])
                 x1, y1, x2, y2 = bbox
diff --git a/nerfreal.py b/nerfreal.py
index 10a221a..04805ae 100644
--- a/nerfreal.py
+++ b/nerfreal.py
@@ -126,17 +126,7 @@ class NeRFReal(BaseReal):
 
     def __exit__(self, exc_type, exc_value, traceback):
         if self.opt.asr:
-            self.asr.stop()
-
-    def put_msg_txt(self,msg):
-        self.tts.put_msg_txt(msg)
-
-    def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
-        self.asr.put_audio_frame(audio_chunk)
-
-    def pause_talk(self):
-        self.tts.pause_talk()
-        self.asr.pause_talk()
+            self.asr.stop()
 
 
     # def mirror_index(self, index):
@@ -200,6 +190,11 @@ class NeRFReal(BaseReal):
            # #     time.sleep(0.1)
            # asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
            #t = time.time()
+           if audiotype1!=0 and audiotype2!=0: # all silence
+               self.speaking = False
+           else:
+               self.speaking = True
+
            if audiotype1!=0 and audiotype2!=0 and self.custom_index.get(audiotype1) is not None: # not an inference segment and a custom video is configured
                mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype1]),self.custom_index[audiotype1])
                #imgindex = self.mirror_index(self.customimg_index)
diff --git a/web/asr/index.html b/web/asr/index.html
index 68b4ae0..902518c 100644
--- a/web/asr/index.html
+++ b/web/asr/index.html
@@ -12,10 +12,6 @@
 [hunk body garbled in extraction; it removes four lines of page markup around the "FunASR Demo" heading and its description: "This is a demo of the open-source FunASR project, integrating production-grade VAD, ASR and punctuation models; it supports offline transcription of long audio files and real-time speech recognition. Project: https://github.com/alibaba-damo-academy/FunASR"]
diff --git a/web/asr/main.js b/web/asr/main.js
index 9655a50..bd402e9 100644
--- a/web/asr/main.js
+++ b/web/asr/main.js
@@ -51,12 +51,12 @@
 var file_data_array;  // array to save file data
 
 var totalsend=0;
-var now_ipaddress=window.location.href;
-now_ipaddress=now_ipaddress.replace("https://","wss://");
-now_ipaddress=now_ipaddress.replace("static/index.html","");
-var localport=window.location.port;
-now_ipaddress=now_ipaddress.replace(localport,"10095");
-document.getElementById('wssip').value=now_ipaddress;
+// var now_ipaddress=window.location.href;
+// now_ipaddress=now_ipaddress.replace("https://","wss://");
+// now_ipaddress=now_ipaddress.replace("static/index.html","");
+// var localport=window.location.port;
+// now_ipaddress=now_ipaddress.replace(localport,"10095");
+// document.getElementById('wssip').value=now_ipaddress;
 addresschange();
 
 function addresschange()
@@ -343,6 +343,43 @@ function handleWithTimestamp(tmptext,tmptime)
 }
 
 
+
+const sleep = (delay) => new Promise((resolve) => setTimeout(resolve, delay))
+async function is_speaking() {
+    const response = await fetch('/is_speaking', {
+        body: JSON.stringify({
+            sessionid: 0,
+        }),
+        headers: {
+            'Content-Type': 'application/json'
+        },
+        method: 'POST'
+    });
+    const data = await response.json();
+    console.log('is_speaking res:',data)
+    return data.data
+}
+
+async function waitSpeakingEnd() {
+    rec.stop() // stop recording
+    for(let i=0;i<10;i++) { // wait for the avatar to start speaking, up to 10s
+        bspeak = await is_speaking()
+        if(bspeak) {
+            break
+        }
+        await sleep(1000)
+    }
+
+    while(true) { // wait for the avatar to finish speaking
+        bspeak = await is_speaking()
+        if(!bspeak) {
+            break
+        }
+        await sleep(1000)
+    }
+    await sleep(2000)
+    rec.start()
+}
 // ASR result: parse jsonMsg and append the recognized text to the edit box
 function getJsonMessage( jsonMsg ) {
     //console.log(jsonMsg);
@@ -353,9 +390,20 @@ function getJsonMessage( jsonMsg ) {
         var timestamp=JSON.parse(jsonMsg.data)['timestamp'];
         if(asrmodel=="2pass-offline" || asrmodel=="offline")
         {
-
-            offline_text=offline_text+handleWithTimestamp(rectxt,timestamp); //rectxt; //.replace(/ +/g,"");
+            offline_text=offline_text+rectxt.replace(/ +/g,"")+'\n'; //handleWithTimestamp(rectxt,timestamp); //rectxt; //.replace(/ +/g,"");
             rec_text=offline_text;
+            fetch('/human', {
+                body: JSON.stringify({
+                    text: rectxt.replace(/ +/g,""),
+                    type: 'echo',
+                }),
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                method: 'POST'
+            });
+
+            waitSpeakingEnd();
         }
         else
         {
diff --git a/web/rtcpushapi-asr.html b/web/rtcpushapi-asr.html
new file mode 100644
index 0000000..e09eeb4
--- /dev/null
+++ b/web/rtcpushapi-asr.html
@@ -0,0 +1,136 @@
+[136 new lines of HTML lost in extraction; recoverable fragments: page title "WebRTC webcam", start/stop controls, an "input text" form, and a "Media" section]
diff --git a/web/webrtcapi-asr.html b/web/webrtcapi-asr.html
new file mode 100644
index 0000000..2fee033
--- /dev/null
+++ b/web/webrtcapi-asr.html
@@ -0,0 +1,186 @@
+[186 new lines of HTML lost in extraction; recoverable fragments: page title "WebRTC webcam", start/stop controls, an "input text" form, and a "Media" section]
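
For reference, a minimal Python sketch of how a client could drive the new endpoints, mirroring the waitSpeakingEnd() polling loop in web/asr/main.js. This is not part of the patch: the base URL and port are assumptions for illustration, and the session id follows the hardcoded sessionid 0 in the patch's JS.

# Minimal client sketch for the /is_speaking endpoint added in this patch.
# Assumptions (not from the patch): the server listens on localhost:8010
# and a session with id 0 exists; adjust BASE_URL/SESSION_ID as needed.
import json
import time
import urllib.request

BASE_URL = "http://localhost:8010"  # assumed host/port
SESSION_ID = 0                      # session id, as hardcoded in main.js

def post(path, payload):
    # POST a JSON body and decode the JSON response.
    req = urllib.request.Request(
        BASE_URL + path,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))

def is_speaking():
    # The endpoint returns {"code": 0, "data": <bool>}.
    return post("/is_speaking", {"sessionid": SESSION_ID})["data"]

def wait_speaking_end(start_timeout_s=10):
    # Same pattern as waitSpeakingEnd() in web/asr/main.js: wait up to
    # start_timeout_s for speech to start, then wait until it ends.
    for _ in range(start_timeout_s):
        if is_speaking():
            break
        time.sleep(1)
    while is_speaking():
        time.sleep(1)

if __name__ == "__main__":
    post("/human", {"text": "hello", "type": "echo"})  # make the avatar speak
    wait_speaking_end()
    print("avatar finished speaking")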