add audio asr input

This commit is contained in:
lipku 2024-09-17 22:11:46 +08:00
parent 8d5a38222b
commit a8b40fa813
10 changed files with 419 additions and 45 deletions

13
app.py
View File

@ -186,6 +186,18 @@ async def record(request):
),
)
async def is_speaking(request):
    """POST /is_speaking — report whether the avatar of the given session is talking.

    Expects a JSON body with an optional 'sessionid' (defaults to 0) and
    replies with {"code": 0, "data": <bool>}.
    """
    body = await request.json()
    session_id = body.get('sessionid', 0)
    speaking = nerfreals[session_id].is_speaking()
    payload = json.dumps({"code": 0, "data": speaking})
    return web.Response(content_type="application/json", text=payload)
async def on_shutdown(app):
# close peer connections
coros = [pc.close() for pc in pcs]
@ -445,6 +457,7 @@ if __name__ == '__main__':
appasync.router.add_post("/human", human)
appasync.router.add_post("/set_audiotype", set_audiotype)
appasync.router.add_post("/record", record)
appasync.router.add_post("/is_speaking", is_speaking)
appasync.router.add_static('/',path='web')
# Configure default CORS settings.

View File

@ -48,6 +48,9 @@ class BaseASR:
return frame,type
def is_audio_frame_empty(self)->bool:
    """Return True when the inbound audio-frame queue holds nothing."""
    pending_frames = self.queue
    return pending_frames.empty()
def get_audio_out(self):
    """Block until the next original PCM chunk is available and return it.

    This feeds the untouched audio back to the renderer (nerf) side.
    """
    chunk = self.output_queue.get()
    return chunk

View File

@ -44,6 +44,8 @@ class BaseReal:
elif opt.tts == "cosyvoice":
self.tts = CosyVoiceTTS(opt,self)
self.speaking = False
self.recording = False
self.recordq_video = Queue()
self.recordq_audio = Queue()
@ -55,6 +57,19 @@ class BaseReal:
self.custom_index = {}
self.custom_opt = {}
self.__loadcustom()
def put_msg_txt(self,msg):
    """Hand a text message to the TTS engine for speech synthesis."""
    tts_engine = self.tts
    tts_engine.put_msg_txt(msg)
def put_audio_frame(self,audio_chunk):
    """Feed one PCM chunk (16 kHz, 20 ms per the caller's contract) into the ASR pipeline."""
    asr_engine = self.asr
    asr_engine.put_audio_frame(audio_chunk)
def pause_talk(self):
    """Interrupt current speech: flush the TTS engine first, then the ASR feed."""
    for component in (self.tts, self.asr):
        component.pause_talk()
def is_speaking(self)->bool:
    """Expose whether the avatar is currently voicing audio (set by the render loop)."""
    currently_speaking = self.speaking
    return currently_speaking
def __loadcustom(self):
for item in self.opt.customopt:

View File

@ -191,17 +191,6 @@ class LipReal(BaseReal):
input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
self.frame_list_cycle = read_imgs(input_img_list)
#self.imagecache = ImgCache(len(self.coord_list_cycle),self.full_imgs_path,1000)
def put_msg_txt(self,msg):
self.tts.put_msg_txt(msg)
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
self.asr.put_audio_frame(audio_chunk)
def pause_talk(self):
self.tts.pause_talk()
self.asr.pause_talk()
def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
@ -212,6 +201,7 @@ class LipReal(BaseReal):
except queue.Empty:
continue
if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据只需要取fullimg
self.speaking = False
audiotype = audio_frames[0][1]
if self.custom_index.get(audiotype) is not None: #有自定义视频
mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
@ -223,6 +213,7 @@ class LipReal(BaseReal):
combine_frame = self.frame_list_cycle[idx]
#combine_frame = self.imagecache.get_img(idx)
else:
self.speaking = True
bbox = self.coord_list_cycle[idx]
combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
#combine_frame = copy.deepcopy(self.imagecache.get_img(idx))

View File

@ -189,17 +189,6 @@ class MuseReal(BaseReal):
input_mask_list = glob.glob(os.path.join(self.mask_out_path, '*.[jpJP][pnPN]*[gG]'))
input_mask_list = sorted(input_mask_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
self.mask_list_cycle = read_imgs(input_mask_list)
def put_msg_txt(self,msg):
self.tts.put_msg_txt(msg)
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
self.asr.put_audio_frame(audio_chunk)
def pause_talk(self):
self.tts.pause_talk()
self.asr.pause_talk()
def __mirror_index(self, index):
@ -243,6 +232,7 @@ class MuseReal(BaseReal):
except queue.Empty:
continue
if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据只需要取fullimg
self.speaking = False
audiotype = audio_frames[0][1]
if self.custom_index.get(audiotype) is not None: #有自定义视频
mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
@ -253,6 +243,7 @@ class MuseReal(BaseReal):
else:
combine_frame = self.frame_list_cycle[idx]
else:
self.speaking = True
bbox = self.coord_list_cycle[idx]
ori_frame = copy.deepcopy(self.frame_list_cycle[idx])
x1, y1, x2, y2 = bbox

View File

@ -126,17 +126,7 @@ class NeRFReal(BaseReal):
def __exit__(self, exc_type, exc_value, traceback):
if self.opt.asr:
self.asr.stop()
def put_msg_txt(self,msg):
self.tts.put_msg_txt(msg)
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
self.asr.put_audio_frame(audio_chunk)
def pause_talk(self):
self.tts.pause_talk()
self.asr.pause_talk()
self.asr.stop()
# def mirror_index(self, index):
@ -200,6 +190,11 @@ class NeRFReal(BaseReal):
# # time.sleep(0.1)
# asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
#t = time.time()
if audiotype1!=0 and audiotype2!=0: #全为静音数据
self.speaking = False
else:
self.speaking = True
if audiotype1!=0 and audiotype2!=0 and self.custom_index.get(audiotype1) is not None: #不为推理视频并且有自定义视频
mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype1]),self.custom_index[audiotype1])
#imgindex = self.mirror_index(self.customimg_index)

View File

@ -12,10 +12,6 @@
<script src="pcm.js" charset="UTF-8"></script>
<h1>FunASR Demo</h1>
<h3>这里是FunASR开源项目体验demo集成了VAD、ASR与标点等工业级别的模型支持长音频离线文件转写实时语音识别等开源项目地址https://github.com/alibaba-damo-academy/FunASR</h3>
<div class="div_class_topArea">
<div class="div_class_recordControl">

View File

@ -51,12 +51,12 @@ var file_data_array; // array to save file data
var totalsend=0;
var now_ipaddress=window.location.href;
now_ipaddress=now_ipaddress.replace("https://","wss://");
now_ipaddress=now_ipaddress.replace("static/index.html","");
var localport=window.location.port;
now_ipaddress=now_ipaddress.replace(localport,"10095");
document.getElementById('wssip').value=now_ipaddress;
// var now_ipaddress=window.location.href;
// now_ipaddress=now_ipaddress.replace("https://","wss://");
// now_ipaddress=now_ipaddress.replace("static/index.html","");
// var localport=window.location.port;
// now_ipaddress=now_ipaddress.replace(localport,"10095");
// document.getElementById('wssip').value=now_ipaddress;
addresschange();
function addresschange()
{
@ -343,6 +343,43 @@ function handleWithTimestamp(tmptext,tmptime)
}
const sleep = (delay) => new Promise((resolve) => setTimeout(resolve, delay))
// Ask the server whether the avatar of session 0 is currently talking.
// Resolves to the boolean `data` field of the /is_speaking response.
async function is_speaking() {
    const resp = await fetch('/is_speaking', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({
            sessionid: 0,
        })
    });
    const payload = await resp.json();
    console.log('is_speaking res:', payload);
    return payload.data;
}
// Pause the microphone while the avatar replies, then resume it.
// Fix: `bspeak` was assigned without declaration, creating an implicit
// global (ReferenceError under strict mode, state leaking between calls);
// it is now a local `let` binding.
async function waitSpeakingEnd() {
    rec.stop() // 关闭录音 — stop capturing so the avatar's voice isn't re-transcribed
    // Wait up to 10s (10 polls, 1s apart) for the avatar to START speaking.
    for (let i = 0; i < 10; i++) {
        let speaking = await is_speaking()
        if (speaking) {
            break
        }
        await sleep(1000)
    }
    // Then poll until it STOPS speaking.
    while (await is_speaking()) {
        await sleep(1000)
    }
    await sleep(2000) // grace period so the mic doesn't catch the audio tail
    rec.start()
}
// 语音识别结果; 对jsonMsg数据解析,将识别结果附加到编辑框中
function getJsonMessage( jsonMsg ) {
//console.log(jsonMsg);
@ -353,9 +390,20 @@ function getJsonMessage( jsonMsg ) {
var timestamp=JSON.parse(jsonMsg.data)['timestamp'];
if(asrmodel=="2pass-offline" || asrmodel=="offline")
{
offline_text=offline_text+handleWithTimestamp(rectxt,timestamp); //rectxt; //.replace(/ +/g,"");
offline_text=offline_text+rectxt.replace(/ +/g,"")+'\n'; //handleWithTimestamp(rectxt,timestamp); //rectxt; //.replace(/ +/g,"");
rec_text=offline_text;
fetch('/human', {
body: JSON.stringify({
text: rectxt.replace(/ +/g,""),
type: 'echo',
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
});
waitSpeakingEnd();
}
else
{

136
web/rtcpushapi-asr.html Normal file
View File

@ -0,0 +1,136 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>WebRTC webcam</title>
<style>
button {
padding: 8px 16px;
}
video {
width: 100%;
}
.option {
margin-bottom: 8px;
}
#media {
max-width: 1280px;
}
</style>
</head>
<body>
<div class="option">
<input id="use-stun" type="checkbox"/>
<label for="use-stun">Use STUN server</label>
</div>
<button class="btn btn-primary" id="btn_play">Start</button>
<form class="form-inline" id="echo-form">
<div class="form-group">
<p>input text</p>
<textarea cols="2" rows="3" style="width:600px;height:50px;" class="form-control" id="message">test</textarea>
</div>
<button type="submit" class="btn btn-default">Send</button>
</form>
<div id="media">
<h2>Media</h2>
<video id="rtc_media_player" style="width:600px;" controls autoplay></video>
</div>
<iframe src="asr/index.html" width="600" height="500"></iframe>
<script src="srs.sdk.js"></script>
<script type="text/javascript" src="http://cdn.sockjs.org/sockjs-0.3.4.js"></script>
<script type="text/javascript" src="https://ajax.aspnetcdn.com/ajax/jquery/jquery-2.1.1.min.js"></script>
</body>
<script type="text/javascript" charset="utf-8">
$(document).ready(function() {
// var host = window.location.hostname
// var ws = new WebSocket("ws://"+host+":8000/humanecho");
// //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
// ws.onopen = function() {
// console.log('Connected');
// };
// ws.onmessage = function(e) {
// console.log('Received: ' + e.data);
// data = e
// var vid = JSON.parse(data.data);
// console.log(typeof(vid),vid)
// //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
// };
// ws.onclose = function(e) {
// console.log('Closed');
// };
$('#echo-form').on('submit', function(e) {
e.preventDefault();
var message = $('#message').val();
console.log('Sending: ' + message);
fetch('/human', {
body: JSON.stringify({
text: message,
type: 'echo',
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
});
//ws.send(message);
$('#message').val('');
});
});
$(function(){
var sdk = null; // Global handler to do cleanup when republishing.
// (Re)start WHEP playback of the SRS live stream into #rtc_media_player.
// Captures `sdk` from the enclosing closure so a replay can tear down the
// previous peer connection before creating a new one.
var startPlay = function() {
$('#rtc_media_player').show();
// Close PC when user replay.
if (sdk) {
sdk.close();
}
sdk = new SrsRtcWhipWhepAsync();
// User should set the stream when publish is done, @see https://webrtc.org/getting-started/media-devices
// However SRS SDK provides a consist API like https://webrtc.org/getting-started/remote-streams
$('#rtc_media_player').prop('srcObject', sdk.stream);
// Optional callback, SDK will add track to stream.
// sdk.ontrack = function (event) { console.log('Got track', event); sdk.stream.addTrack(event.track); };
var host = window.location.hostname
// For example: webrtc://r.ossrs.net/live/livestream
// NOTE(review): WHEP endpoint port 1985 and app/stream names are hard-coded — confirm against the SRS deployment.
var url = "http://"+host+":1985/rtc/v1/whep/?app=live&stream=livestream"
sdk.play(url).then(function(session){
//$('#sessionid').html(session.sessionid);
//$('#simulator-drop').attr('href', session.simulator + '?drop=1&username=' + session.sessionid);
}).catch(function (reason) {
// On failure, release the peer connection and hide the player again.
sdk.close();
$('#rtc_media_player').hide();
console.error(reason);
});
};
$('#rtc_media_player').hide();
// var query = parse_query_string();
// srs_init_whep("#txt_url", query);
$("#btn_play").click(startPlay);
// Never play util windows loaded @see https://github.com/ossrs/srs/issues/2732
// if (query.autostart === 'true') {
// $('#rtc_media_player').prop('muted', true);
// console.warn('For autostart, we should mute it, see https://www.jianshu.com/p/c3c6944eed5a ' +
// 'or https://developers.google.com/web/updates/2017/09/autoplay-policy-changes#audiovideo_elements');
// window.addEventListener("load", function(){ startPlay(); });
// }
});
</script>
</html>

186
web/webrtcapi-asr.html Normal file
View File

@ -0,0 +1,186 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>WebRTC webcam</title>
<style>
button {
padding: 8px 16px;
}
video {
width: 100%;
}
.option {
margin-bottom: 8px;
}
#media {
max-width: 1280px;
}
</style>
</head>
<body>
<div class="option">
<input id="use-stun" type="checkbox"/>
<label for="use-stun">Use STUN server</label>
</div>
<button id="start" onclick="start()">Start</button>
<button id="stop" style="display: none" onclick="stop()">Stop</button>
<button class="btn btn-primary" id="btn_start_record">Start Recording</button>
<button class="btn btn-primary" id="btn_stop_record" disabled>Stop Recording</button>
<!-- <button class="btn btn-primary" id="btn_download">Download Video</button> -->
<input type="hidden" id="sessionid" value="0">
<form class="form-inline" id="echo-form">
<div class="form-group">
<p>input text</p>
<textarea cols="2" rows="3" style="width:600px;height:50px;" class="form-control" id="message">test</textarea>
</div>
<button type="submit" class="btn btn-default">Send</button>
</form>
<div id="media">
<h2>Media</h2>
<audio id="audio" autoplay="true"></audio>
<video id="video" style="width:600px;" autoplay="true" playsinline="true"></video>
</div>
<iframe src="asr/index.html" width="600" height="500"></iframe>
<script src="client.js"></script>
<script type="text/javascript" src="http://cdn.sockjs.org/sockjs-0.3.4.js"></script>
<script type="text/javascript" src="https://ajax.aspnetcdn.com/ajax/jquery/jquery-2.1.1.min.js"></script>
</body>
<script type="text/javascript" charset="utf-8">
$(document).ready(function() {
// var host = window.location.hostname
// var ws = new WebSocket("ws://"+host+":8000/humanecho");
// //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
// ws.onopen = function() {
// console.log('Connected');
// };
// ws.onmessage = function(e) {
// console.log('Received: ' + e.data);
// data = e
// var vid = JSON.parse(data.data);
// console.log(typeof(vid),vid)
// //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
// };
// ws.onclose = function(e) {
// console.log('Closed');
// };
$('#echo-form').on('submit', function(e) {
e.preventDefault();
var message = $('#message').val();
console.log('Sending: ' + message);
console.log('sessionid: ',document.getElementById('sessionid').value);
fetch('/human', {
body: JSON.stringify({
text: message,
type: 'echo',
interrupt: true,
sessionid:parseInt(document.getElementById('sessionid').value),
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
});
//ws.send(message);
$('#message').val('');
});
$('#btn_start_record').click(function() {
// 开始录制
console.log('Starting recording...');
fetch('/record', {
body: JSON.stringify({
type: 'start_record',
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
}).then(function(response) {
if (response.ok) {
console.log('Recording started.');
$('#btn_start_record').prop('disabled', true);
$('#btn_stop_record').prop('disabled', false);
// $('#btn_download').prop('disabled', true);
} else {
console.error('Failed to start recording.');
}
}).catch(function(error) {
console.error('Error:', error);
});
});
$('#btn_stop_record').click(function() {
// 结束录制
console.log('Stopping recording...');
fetch('/record', {
body: JSON.stringify({
type: 'end_record',
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
}).then(function(response) {
if (response.ok) {
console.log('Recording stopped.');
$('#btn_start_record').prop('disabled', false);
$('#btn_stop_record').prop('disabled', true);
// $('#btn_download').prop('disabled', false);
} else {
console.error('Failed to stop recording.');
}
}).catch(function(error) {
console.error('Error:', error);
});
});
// $('#btn_download').click(function() {
// // 下载视频文件
// console.log('Downloading video...');
// fetch('/record_lasted.mp4', {
// method: 'GET'
// }).then(function(response) {
// if (response.ok) {
// return response.blob();
// } else {
// throw new Error('Failed to download the video.');
// }
// }).then(function(blob) {
// // 创建一个 Blob 对象
// const url = window.URL.createObjectURL(blob);
// // 创建一个隐藏的可下载链接
// const a = document.createElement('a');
// a.style.display = 'none';
// a.href = url;
// a.download = 'record_lasted.mp4';
// document.body.appendChild(a);
// // 触发下载
// a.click();
// // 清理
// window.URL.revokeObjectURL(url);
// document.body.removeChild(a);
// console.log('Video downloaded successfully.');
// }).catch(function(error) {
// console.error('Error:', error);
// });
// });
});
</script>
</html>