add audio asr input

This commit is contained in:
lipku 2024-09-17 22:11:46 +08:00
parent 8d5a38222b
commit a8b40fa813
10 changed files with 419 additions and 45 deletions

13
app.py
View File

@ -186,6 +186,18 @@ async def record(request):
),
)
async def is_speaking(request):
    """POST /is_speaking — report whether the avatar of the given session is talking.

    Expects a JSON body with an optional 'sessionid' (defaults to 0) and
    replies with {"code": 0, "data": <bool>}.
    """
    body = await request.json()
    session_id = body.get('sessionid', 0)
    speaking = nerfreals[session_id].is_speaking()
    payload = json.dumps({"code": 0, "data": speaking})
    return web.Response(content_type="application/json", text=payload)
async def on_shutdown(app):
# close peer connections
coros = [pc.close() for pc in pcs]
@ -445,6 +457,7 @@ if __name__ == '__main__':
appasync.router.add_post("/human", human)
appasync.router.add_post("/set_audiotype", set_audiotype)
appasync.router.add_post("/record", record)
appasync.router.add_post("/is_speaking", is_speaking)
appasync.router.add_static('/',path='web')
# Configure default CORS settings.

View File

@ -48,6 +48,9 @@ class BaseASR:
return frame,type
def is_audio_frame_empty(self)->bool:
    """Return True when the inbound audio-frame queue holds nothing."""
    pending_frames = self.queue
    return pending_frames.empty()
def get_audio_out(self):
    """Block until the next original PCM chunk is available and return it.

    This feeds the untouched audio back to the renderer (nerf) side.
    """
    chunk = self.output_queue.get()
    return chunk

View File

@ -44,6 +44,8 @@ class BaseReal:
elif opt.tts == "cosyvoice":
self.tts = CosyVoiceTTS(opt,self)
self.speaking = False
self.recording = False
self.recordq_video = Queue()
self.recordq_audio = Queue()
@ -55,6 +57,19 @@ class BaseReal:
self.custom_index = {}
self.custom_opt = {}
self.__loadcustom()
def put_msg_txt(self,msg):
    """Hand a text message to the TTS engine for speech synthesis."""
    tts_engine = self.tts
    tts_engine.put_msg_txt(msg)
def put_audio_frame(self,audio_chunk):
    """Feed one PCM chunk (16 kHz, 20 ms per the caller's contract) into the ASR pipeline."""
    asr_engine = self.asr
    asr_engine.put_audio_frame(audio_chunk)
def pause_talk(self):
    """Interrupt current speech: flush the TTS engine first, then the ASR feed."""
    for component in (self.tts, self.asr):
        component.pause_talk()
def is_speaking(self)->bool:
    """Expose whether the avatar is currently voicing audio (set by the render loop)."""
    currently_speaking = self.speaking
    return currently_speaking
def __loadcustom(self):
for item in self.opt.customopt:

View File

@ -191,17 +191,6 @@ class LipReal(BaseReal):
input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
self.frame_list_cycle = read_imgs(input_img_list)
#self.imagecache = ImgCache(len(self.coord_list_cycle),self.full_imgs_path,1000)
def put_msg_txt(self,msg):
self.tts.put_msg_txt(msg)
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
self.asr.put_audio_frame(audio_chunk)
def pause_talk(self):
self.tts.pause_talk()
self.asr.pause_talk()
def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
@ -212,6 +201,7 @@ class LipReal(BaseReal):
except queue.Empty:
continue
if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据只需要取fullimg
self.speaking = False
audiotype = audio_frames[0][1]
if self.custom_index.get(audiotype) is not None: #有自定义视频
mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
@ -223,6 +213,7 @@ class LipReal(BaseReal):
combine_frame = self.frame_list_cycle[idx]
#combine_frame = self.imagecache.get_img(idx)
else:
self.speaking = True
bbox = self.coord_list_cycle[idx]
combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
#combine_frame = copy.deepcopy(self.imagecache.get_img(idx))

View File

@ -189,17 +189,6 @@ class MuseReal(BaseReal):
input_mask_list = glob.glob(os.path.join(self.mask_out_path, '*.[jpJP][pnPN]*[gG]'))
input_mask_list = sorted(input_mask_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
self.mask_list_cycle = read_imgs(input_mask_list)
def put_msg_txt(self,msg):
self.tts.put_msg_txt(msg)
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
self.asr.put_audio_frame(audio_chunk)
def pause_talk(self):
self.tts.pause_talk()
self.asr.pause_talk()
def __mirror_index(self, index):
@ -243,6 +232,7 @@ class MuseReal(BaseReal):
except queue.Empty:
continue
if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据只需要取fullimg
self.speaking = False
audiotype = audio_frames[0][1]
if self.custom_index.get(audiotype) is not None: #有自定义视频
mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
@ -253,6 +243,7 @@ class MuseReal(BaseReal):
else:
combine_frame = self.frame_list_cycle[idx]
else:
self.speaking = True
bbox = self.coord_list_cycle[idx]
ori_frame = copy.deepcopy(self.frame_list_cycle[idx])
x1, y1, x2, y2 = bbox

View File

@ -126,17 +126,7 @@ class NeRFReal(BaseReal):
def __exit__(self, exc_type, exc_value, traceback):
if self.opt.asr:
self.asr.stop()
def put_msg_txt(self,msg):
self.tts.put_msg_txt(msg)
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
self.asr.put_audio_frame(audio_chunk)
def pause_talk(self):
self.tts.pause_talk()
self.asr.pause_talk()
self.asr.stop()
# def mirror_index(self, index):
@ -200,6 +190,11 @@ class NeRFReal(BaseReal):
# # time.sleep(0.1)
# asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
#t = time.time()
if audiotype1!=0 and audiotype2!=0: #全为静音数据
self.speaking = False
else:
self.speaking = True
if audiotype1!=0 and audiotype2!=0 and self.custom_index.get(audiotype1) is not None: #不为推理视频并且有自定义视频
mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype1]),self.custom_index[audiotype1])
#imgindex = self.mirror_index(self.customimg_index)

View File

@ -12,10 +12,6 @@
<script src="pcm.js" charset="UTF-8"></script>
<h1>FunASR Demo</h1>
<h3>这里是FunASR开源项目体验demo集成了VAD、ASR与标点等工业级别的模型支持长音频离线文件转写实时语音识别等开源项目地址https://github.com/alibaba-damo-academy/FunASR</h3>
<div class="div_class_topArea">
<div class="div_class_recordControl">

View File

@ -51,12 +51,12 @@ var file_data_array; // array to save file data
var totalsend=0;
var now_ipaddress=window.location.href;
now_ipaddress=now_ipaddress.replace("https://","wss://");
now_ipaddress=now_ipaddress.replace("static/index.html","");
var localport=window.location.port;
now_ipaddress=now_ipaddress.replace(localport,"10095");
document.getElementById('wssip').value=now_ipaddress;
// var now_ipaddress=window.location.href;
// now_ipaddress=now_ipaddress.replace("https://","wss://");
// now_ipaddress=now_ipaddress.replace("static/index.html","");
// var localport=window.location.port;
// now_ipaddress=now_ipaddress.replace(localport,"10095");
// document.getElementById('wssip').value=now_ipaddress;
addresschange();
function addresschange()
{
@ -343,6 +343,43 @@ function handleWithTimestamp(tmptext,tmptime)
}
const sleep = (delay) => new Promise((resolve) => setTimeout(resolve, delay))
// Ask the server whether the avatar of session 0 is currently talking.
// Resolves to the boolean `data` field of the /is_speaking response.
async function is_speaking() {
    const resp = await fetch('/is_speaking', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({
            sessionid: 0,
        })
    });
    const payload = await resp.json();
    console.log('is_speaking res:', payload);
    return payload.data;
}
// Pause the microphone while the avatar replies, then resume it.
// Fix: `bspeak` was assigned without declaration, creating an implicit
// global (ReferenceError under strict mode, state leaking between calls);
// it is now a local `let` binding.
async function waitSpeakingEnd() {
    rec.stop() // 关闭录音 — stop capturing so the avatar's voice isn't re-transcribed
    // Wait up to 10s (10 polls, 1s apart) for the avatar to START speaking.
    for (let i = 0; i < 10; i++) {
        let speaking = await is_speaking()
        if (speaking) {
            break
        }
        await sleep(1000)
    }
    // Then poll until it STOPS speaking.
    while (await is_speaking()) {
        await sleep(1000)
    }
    await sleep(2000) // grace period so the mic doesn't catch the audio tail
    rec.start()
}
// 语音识别结果; 对jsonMsg数据解析,将识别结果附加到编辑框中
function getJsonMessage( jsonMsg ) {
//console.log(jsonMsg);
@ -353,9 +390,20 @@ function getJsonMessage( jsonMsg ) {
var timestamp=JSON.parse(jsonMsg.data)['timestamp'];
if(asrmodel=="2pass-offline" || asrmodel=="offline")
{
offline_text=offline_text+handleWithTimestamp(rectxt,timestamp); //rectxt; //.replace(/ +/g,"");
offline_text=offline_text+rectxt.replace(/ +/g,"")+'\n'; //handleWithTimestamp(rectxt,timestamp); //rectxt; //.replace(/ +/g,"");
rec_text=offline_text;
fetch('/human', {
body: JSON.stringify({
text: rectxt.replace(/ +/g,""),
type: 'echo',
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
});
waitSpeakingEnd();
}
else
{

136
web/rtcpushapi-asr.html Normal file
View File

@ -0,0 +1,136 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>WebRTC webcam</title>
<style>
button {
padding: 8px 16px;
}
video {
width: 100%;
}
.option {
margin-bottom: 8px;
}
#media {
max-width: 1280px;
}
</style>
</head>
<body>
<div class="option">
<input id="use-stun" type="checkbox"/>
<label for="use-stun">Use STUN server</label>
</div>
<button class="btn btn-primary" id="btn_play">Start</button>
<form class="form-inline" id="echo-form">
<div class="form-group">
<p>input text</p>
<textarea cols="2" rows="3" style="width:600px;height:50px;" class="form-control" id="message">test</textarea>
</div>
<button type="submit" class="btn btn-default">Send</button>
</form>
<div id="media">
<h2>Media</h2>
<video id="rtc_media_player" style="width:600px;" controls autoplay></video>
</div>
<iframe src="asr/index.html" width="600" height="500"></iframe>
<script src="srs.sdk.js"></script>
<script type="text/javascript" src="http://cdn.sockjs.org/sockjs-0.3.4.js"></script>
<script type="text/javascript" src="https://ajax.aspnetcdn.com/ajax/jquery/jquery-2.1.1.min.js"></script>
</body>
<script type="text/javascript" charset="utf-8">
$(document).ready(function() {
// var host = window.location.hostname
// var ws = new WebSocket("ws://"+host+":8000/humanecho");
// //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
// ws.onopen = function() {
// console.log('Connected');
// };
// ws.onmessage = function(e) {
// console.log('Received: ' + e.data);
// data = e
// var vid = JSON.parse(data.data);
// console.log(typeof(vid),vid)
// //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
// };
// ws.onclose = function(e) {
// console.log('Closed');
// };
$('#echo-form').on('submit', function(e) {
e.preventDefault();
var message = $('#message').val();
console.log('Sending: ' + message);
fetch('/human', {
body: JSON.stringify({
text: message,
type: 'echo',
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
});
//ws.send(message);
$('#message').val('');
});
});
$(function(){
var sdk = null; // Global handler to do cleanup when republishing.
// (Re)start WHEP playback of the SRS live stream into #rtc_media_player.
// Captures `sdk` from the enclosing closure so a replay can tear down the
// previous peer connection before creating a new one.
var startPlay = function() {
$('#rtc_media_player').show();
// Close PC when user replay.
if (sdk) {
sdk.close();
}
sdk = new SrsRtcWhipWhepAsync();
// User should set the stream when publish is done, @see https://webrtc.org/getting-started/media-devices
// However SRS SDK provides a consist API like https://webrtc.org/getting-started/remote-streams
$('#rtc_media_player').prop('srcObject', sdk.stream);
// Optional callback, SDK will add track to stream.
// sdk.ontrack = function (event) { console.log('Got track', event); sdk.stream.addTrack(event.track); };
var host = window.location.hostname
// For example: webrtc://r.ossrs.net/live/livestream
// NOTE(review): WHEP endpoint port 1985 and app/stream names are hard-coded — confirm against the SRS deployment.
var url = "http://"+host+":1985/rtc/v1/whep/?app=live&stream=livestream"
sdk.play(url).then(function(session){
//$('#sessionid').html(session.sessionid);
//$('#simulator-drop').attr('href', session.simulator + '?drop=1&username=' + session.sessionid);
}).catch(function (reason) {
// On failure, release the peer connection and hide the player again.
sdk.close();
$('#rtc_media_player').hide();
console.error(reason);
});
};
$('#rtc_media_player').hide();
// var query = parse_query_string();
// srs_init_whep("#txt_url", query);
$("#btn_play").click(startPlay);
// Never play util windows loaded @see https://github.com/ossrs/srs/issues/2732
// if (query.autostart === 'true') {
// $('#rtc_media_player').prop('muted', true);
// console.warn('For autostart, we should mute it, see https://www.jianshu.com/p/c3c6944eed5a ' +
// 'or https://developers.google.com/web/updates/2017/09/autoplay-policy-changes#audiovideo_elements');
// window.addEventListener("load", function(){ startPlay(); });
// }
});
</script>
</html>

186
web/webrtcapi-asr.html Normal file
View File

@ -0,0 +1,186 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>WebRTC webcam</title>
<style>
button {
padding: 8px 16px;
}
video {
width: 100%;
}
.option {
margin-bottom: 8px;
}
#media {
max-width: 1280px;
}
</style>
</head>
<body>
<div class="option">
<input id="use-stun" type="checkbox"/>
<label for="use-stun">Use STUN server</label>
</div>
<button id="start" onclick="start()">Start</button>
<button id="stop" style="display: none" onclick="stop()">Stop</button>
<button class="btn btn-primary" id="btn_start_record">Start Recording</button>
<button class="btn btn-primary" id="btn_stop_record" disabled>Stop Recording</button>
<!-- <button class="btn btn-primary" id="btn_download">Download Video</button> -->
<input type="hidden" id="sessionid" value="0">
<form class="form-inline" id="echo-form">
<div class="form-group">
<p>input text</p>
<textarea cols="2" rows="3" style="width:600px;height:50px;" class="form-control" id="message">test</textarea>
</div>
<button type="submit" class="btn btn-default">Send</button>
</form>
<div id="media">
<h2>Media</h2>
<audio id="audio" autoplay="true"></audio>
<video id="video" style="width:600px;" autoplay="true" playsinline="true"></video>
</div>
<iframe src="asr/index.html" width="600" height="500"></iframe>
<script src="client.js"></script>
<script type="text/javascript" src="http://cdn.sockjs.org/sockjs-0.3.4.js"></script>
<script type="text/javascript" src="https://ajax.aspnetcdn.com/ajax/jquery/jquery-2.1.1.min.js"></script>
</body>
<script type="text/javascript" charset="utf-8">
$(document).ready(function() {
// var host = window.location.hostname
// var ws = new WebSocket("ws://"+host+":8000/humanecho");
// //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
// ws.onopen = function() {
// console.log('Connected');
// };
// ws.onmessage = function(e) {
// console.log('Received: ' + e.data);
// data = e
// var vid = JSON.parse(data.data);
// console.log(typeof(vid),vid)
// //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
// };
// ws.onclose = function(e) {
// console.log('Closed');
// };
$('#echo-form').on('submit', function(e) {
e.preventDefault();
var message = $('#message').val();
console.log('Sending: ' + message);
console.log('sessionid: ',document.getElementById('sessionid').value);
fetch('/human', {
body: JSON.stringify({
text: message,
type: 'echo',
interrupt: true,
sessionid:parseInt(document.getElementById('sessionid').value),
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
});
//ws.send(message);
$('#message').val('');
});
$('#btn_start_record').click(function() {
// 开始录制
console.log('Starting recording...');
fetch('/record', {
body: JSON.stringify({
type: 'start_record',
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
}).then(function(response) {
if (response.ok) {
console.log('Recording started.');
$('#btn_start_record').prop('disabled', true);
$('#btn_stop_record').prop('disabled', false);
// $('#btn_download').prop('disabled', true);
} else {
console.error('Failed to start recording.');
}
}).catch(function(error) {
console.error('Error:', error);
});
});
$('#btn_stop_record').click(function() {
// 结束录制
console.log('Stopping recording...');
fetch('/record', {
body: JSON.stringify({
type: 'end_record',
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
}).then(function(response) {
if (response.ok) {
console.log('Recording stopped.');
$('#btn_start_record').prop('disabled', false);
$('#btn_stop_record').prop('disabled', true);
// $('#btn_download').prop('disabled', false);
} else {
console.error('Failed to stop recording.');
}
}).catch(function(error) {
console.error('Error:', error);
});
});
// $('#btn_download').click(function() {
// // 下载视频文件
// console.log('Downloading video...');
// fetch('/record_lasted.mp4', {
// method: 'GET'
// }).then(function(response) {
// if (response.ok) {
// return response.blob();
// } else {
// throw new Error('Failed to download the video.');
// }
// }).then(function(blob) {
// // 创建一个 Blob 对象
// const url = window.URL.createObjectURL(blob);
// // 创建一个隐藏的可下载链接
// const a = document.createElement('a');
// a.style.display = 'none';
// a.href = url;
// a.download = 'record_lasted.mp4';
// document.body.appendChild(a);
// // 触发下载
// a.click();
// // 清理
// window.URL.revokeObjectURL(url);
// document.body.removeChild(a);
// console.log('Video downloaded successfully.');
// }).catch(function(error) {
// console.error('Error:', error);
// });
// });
});
</script>
</html>