From 71009f9f281060301471c87a526776e99ebb78a3 Mon Sep 17 00:00:00 2001
From: lipku
Date: Thu, 2 May 2024 20:32:28 +0800
Subject: [PATCH] default transport use rtcpush

---
 README.md            |  49 +++++++++---------
 app.py               |   6 +--
 nerfreal.py          |   4 +-
 web/rtcpushchat.html | 125 +++++++++++++++++++++++++++++++++++++++++++
 web/webrtcchat.html  |  83 ++++++++++++++++++++++++++++
 web/whep.js          |  75 ++++++++++++++++++++++++++
 webrtc.py            |  14 ++++-
 7 files changed, 323 insertions(+), 33 deletions(-)
 create mode 100644 web/rtcpushchat.html
 create mode 100644 web/webrtcchat.html
 create mode 100644 web/whep.js

diff --git a/README.md b/README.md
index ea078b7..3088727 100644
--- a/README.md
+++ b/README.md
@@ -26,15 +26,16 @@
 pip install tensorflow-gpu==2.8.0
 ```
 For setting up the CUDA environment on Linux, see https://zhuanlan.zhihu.com/p/674972886

-### 1.2 Install the rtmpstream library
-See https://github.com/lipku/python_rtmpstream
-
-
-## 2. Run
+## 2. Quick Start
+By default webrtc is used to push the stream to srs
 ### 2.1 Run the rtmp server (srs)
 ```
-docker run --rm -it -p 1935:1935 -p 1985:1985 -p 8080:8080 registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5
+export CANDIDATE='<server public IP>'
+docker run --rm --env CANDIDATE=$CANDIDATE \
+  -p 1935:1935 -p 8080:8080 -p 1985:1985 -p 8000:8000/udp \
+  registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5 \
+  objs/srs -c conf/rtc.conf
 ```

 ### 2.2 Start the digital human:
@@ -48,16 +49,15 @@
 python app.py

 export HF_ENDPOINT=https://hf-mirror.com
 ```

-After it starts successfully, open rtmp://serverip/live/livestream with vlc
-
-Open http://serverip:8010/echo.html in a browser, type any text into the text box and submit it; the digital human reads the text aloud
+Open http://serverip:8010/rtcpush.html in a browser, type any text into the text box and submit it; the digital human reads the text aloud
+Note: the server needs to open ports tcp:8000,8010,1985; udp:8000

 ## 3. More Usage
 ### 3.1 Digital human dialogue with an LLM
 Currently follows the approach of the dialogue system [LinlyTalker](https://github.com/Kedreamix/Linly-Talker); the supported LLMs are Chatgpt, Qwen and GeminiPro. Fill in your own api_key in app.py.
-Open http://serverip:8010/chat.html in a browser
+Open http://serverip:8010/rtcpushchat.html in a browser

 ### 3.2 Voice cloning
 Either of the two services below can be used; gpt-sovits is recommended
@@ -106,28 +106,26 @@
 python app.py --fullbody --fullbody_img data/fullbody/img --fullbody_offset_x 10
 - --W, --H: width and height of the training video
 - If the torso from step 3 of ernerf training comes out poorly, there will be a seam at the join; you can add --torso_imgs data/xxx/torso_imgs to the command above so the torso is taken directly from the torso images in the training dataset instead of being inferred by the model. With this approach there may be some artifacts around the neck.

-### 3.6 webrtc
-#### 3.6.1 p2p mode
+### 3.6 webrtc p2p
 This mode does not need srs
 ```
 python app.py --transport webrtc
 ```
 Open http://serverip:8010/webrtc.html in a browser
-#### 3.6.2 One-to-many via srs
-Start srs
+### 3.7 Push rtmp to srs
+- Install the rtmpstream library
+See https://github.com/lipku/python_rtmpstream
+
+- Start srs
 ```
-export CANDIDATE='<server public IP>'
-docker run --rm --env CANDIDATE=$CANDIDATE \
-  -p 1935:1935 -p 8080:8080 -p 1985:1985 -p 8000:8000/udp \
-  registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5 \
-  objs/srs -c conf/rtc.conf
+docker run --rm -it -p 1935:1935 -p 1985:1985 -p 8080:8080 registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5
 ```
-Then run
+- Then run
+```python
+python app.py --transport rtmp --push_url 'rtmp://localhost/live/livestream'
 ```
-python app.py --transport rtcpush --push_url 'http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream'
-```
-Open http://serverip:8010/rtcpush.html in a browser
+Open http://serverip:8010/echo.html in a browser

 ## 4. Docker Run
 The installation in step 1 is not needed; just run it.
@@ -159,10 +157,7 @@ The docker image is no longer the latest code; it can be used as a clean environment into which you copy the latest code
 Overall latency is about 3s
 (1) TTS latency is about 1.7s; edgetts is currently used and each sentence has to be fully converted before being fed in at once, so this could be improved by switching the TTS to streaming input
 (2) wav2vec latency is 0.4s; 18 audio frames have to be buffered for the computation
-(3) srs forwarding latency; configure the srs server to reduce buffering latency, see https://ossrs.net/lts/zh-cn/docs/v5/doc/low-latency for details; a low-latency build is provided
-```python
-docker run --rm -it -p 1935:1935 -p 1985:1985 -p 8080:8080 registry.cn-hangzhou.aliyuncs.com/lipku/srs:v1.1
-```
+(3) srs forwarding latency; configure the srs server to reduce buffering latency, see https://ossrs.net/lts/zh-cn/docs/v5/doc/low-latency for details

 ## 8. TODO
 - [x] Add chatgpt for digital human dialogue
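The patch makes rtcpush the default transport: the generated audio/video is published to SRS over WHIP, i.e. the sender POSTs its SDP offer to the push_url (http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream) with Content-Type application/sdp and applies the SDP answer returned in the response body. Below is a minimal sketch of that publish flow, assuming an aiortc-based sender; whip_publish, video_track and audio_track are illustrative names, not identifiers from this repo.

```python
import aiohttp
from aiortc import RTCPeerConnection, RTCSessionDescription

async def whip_publish(video_track, audio_track,
                       url='http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream'):
    # Offer both tracks to SRS (send-only from our side).
    pc = RTCPeerConnection()
    pc.addTrack(audio_track)
    pc.addTrack(video_track)
    offer = await pc.createOffer()
    await pc.setLocalDescription(offer)  # aiortc finishes ICE gathering here

    # WHIP: POST the offer SDP, read the answer SDP from the response body.
    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=pc.localDescription.sdp,
                                headers={'Content-Type': 'application/sdp'}) as resp:
            answer_sdp = await resp.text()
    await pc.setRemoteDescription(RTCSessionDescription(sdp=answer_sdp, type='answer'))
    return pc
```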
diff --git a/app.py b/app.py
index 2738379..e0c327f 100644
--- a/app.py
+++ b/app.py
@@ -372,8 +372,8 @@ if __name__ == '__main__':
     # parser.add_argument('--asr_model', type=str, default='facebook/wav2vec2-large-960h-lv60-self')
     # parser.add_argument('--asr_model', type=str, default='facebook/hubert-large-ls960-ft')

-    parser.add_argument('--transport', type=str, default='rtmp') #rtmp webrtc rtcpush
-    parser.add_argument('--push_url', type=str, default='rtmp://localhost/live/livestream') #http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream
+    parser.add_argument('--transport', type=str, default='rtcpush') #rtmp webrtc rtcpush
+    parser.add_argument('--push_url', type=str, default='http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream') #rtmp://localhost/live/livestream

     parser.add_argument('--asr_save_feats', action='store_true')
     # audio FPS
@@ -403,7 +403,7 @@ if __name__ == '__main__':
     tts_type = opt.tts
     if tts_type == "xtts":
         print("Computing the latents for a new reference...")
-        gspeaker = get_speaker(opt.REF_FILE, opt.tts_server)
+        gspeaker = get_speaker(opt.REF_FILE, opt.TTS_SERVER)

     # assert test mode
     opt.test = True
diff --git a/nerfreal.py b/nerfreal.py
index c4263a1..8f61539 100644
--- a/nerfreal.py
+++ b/nerfreal.py
@@ -11,7 +11,6 @@
 import cv2
 from asrreal import ASR
 import asyncio
-from rtmp_streaming import StreamerConfig, Streamer
 from av import AudioFrame, VideoFrame

 class NeRFReal:
@@ -202,6 +201,7 @@ class NeRFReal:
         totaltime=0
         if self.opt.transport=='rtmp':
+            from rtmp_streaming import StreamerConfig, Streamer
             fps=25
             #push_url='rtmp://localhost/live/livestream' #'data/video/output_0.mp4'
             sc = StreamerConfig()
@@ -236,7 +236,7 @@ class NeRFReal:
                 totaltime += (time.perf_counter() - t)
                 count += 1
                 if count==100:
-                    print(f"------actual avg fps:{count/totaltime:.4f}")
+                    print(f"------actual avg infer fps:{count/totaltime:.4f}")
                     count=0
                     totaltime=0
                 delay = 0.04 - (time.perf_counter() - t) #40ms
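Besides the new defaults, the nerfreal.py hunk above moves the rtmp_streaming import inside the transport=='rtmp' branch, so python_rtmpstream becomes an optional dependency that only rtmp users need (matching the README change that moves its installation into section 3.7). A minimal sketch of that lazy-import pattern follows; start_output and opt are illustrative names, not code from this repo.

```python
def start_output(opt):
    # Keep the optional import inside the branch that needs it, so users of the
    # webrtc/rtcpush transports never have to install python_rtmpstream.
    if opt.transport == 'rtmp':
        from rtmp_streaming import StreamerConfig, Streamer  # rtmp-only dependency
        sc = StreamerConfig()
        # ... configure sc (frame size, fps, opt.push_url) and drive a Streamer
        #     with it, as NeRFReal does in its render loop ...
    else:
        # webrtc / rtcpush: frames are handed to aiortc media tracks instead
        pass
```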
diff --git a/web/rtcpushchat.html b/web/rtcpushchat.html
new file mode 100644
index 0000000..730541d
--- /dev/null
+++ b/web/rtcpushchat.html
@@ -0,0 +1,125 @@
[125 added lines of page markup were not preserved in this extract; what remains recoverable is a page titled "WebRTC webcam" with an "input text" form and a "Media" section.]
diff --git a/web/webrtcchat.html b/web/webrtcchat.html
new file mode 100644
index 0000000..1b3f2e1
--- /dev/null
+++ b/web/webrtcchat.html
@@ -0,0 +1,83 @@
[83 added lines of page markup were not preserved in this extract; like rtcpushchat.html, what remains recoverable is a page titled "WebRTC webcam" with an "input text" form and a "Media" section.]
diff --git a/web/whep.js b/web/whep.js
new file mode 100644
index 0000000..0020d47
--- /dev/null
+++ b/web/whep.js
@@ -0,0 +1,75 @@
+var pc = null;
+
+function negotiate() {
+    var host = window.location.hostname;
+    pc.addTransceiver('video', { direction: 'recvonly' });
+    pc.addTransceiver('audio', { direction: 'recvonly' });
+    return pc.createOffer().then((offer) => {
+        return pc.setLocalDescription(offer);
+    }).then(() => {
+        // wait for ICE gathering to complete
+        return new Promise((resolve) => {
+            if (pc.iceGatheringState === 'complete') {
+                resolve();
+            } else {
+                const checkState = () => {
+                    if (pc.iceGatheringState === 'complete') {
+                        pc.removeEventListener('icegatheringstatechange', checkState);
+                        resolve();
+                    }
+                };
+                pc.addEventListener('icegatheringstatechange', checkState);
+            }
+        });
+    }).then(() => {
+        var offer = pc.localDescription;
+        return fetch("http://"+host+":1985/rtc/v1/whep/?app=live&stream=livestream", {
+            body: offer.sdp,
+            headers: {
+                'Content-Type': 'application/sdp'
+            },
+            method: 'POST'
+        });
+    }).then((response) => {
+        console.log(response);
+        // the WHEP endpoint answers with the SDP as plain text
+        return response.text();
+    }).then((answer) => {
+        return pc.setRemoteDescription({sdp:answer,type:'answer'});
+    }).catch((e) => {
+        alert(e);
+    });
+}
+
+function start() {
+    var config = {
+        sdpSemantics: 'unified-plan'
+    };
+
+    if (document.getElementById('use-stun').checked) {
+        config.iceServers = [{ urls: ['stun:stun.l.google.com:19302'] }];
+    }
+
+    pc = new RTCPeerConnection(config);
+
+    // connect audio / video
+    pc.addEventListener('track', (evt) => {
+        if (evt.track.kind == 'video') {
+            document.getElementById('video').srcObject = evt.streams[0];
+        } else {
+            document.getElementById('audio').srcObject = evt.streams[0];
+        }
+    });
+
+    document.getElementById('start').style.display = 'none';
+    negotiate();
+    document.getElementById('stop').style.display = 'inline-block';
+}
+
+function stop() {
+    document.getElementById('stop').style.display = 'none';
+
+    // close peer connection
+    setTimeout(() => {
+        pc.close();
+    }, 500);
+}
diff --git a/webrtc.py b/webrtc.py
index 6a2f371..4cca086 100644
--- a/webrtc.py
+++ b/webrtc.py
@@ -36,6 +36,10 @@ class PlayerStreamTrack(MediaStreamTrack):
         self.kind = kind
         self._player = player
         self._queue = asyncio.Queue()
+        if self.kind == 'video':
+            self.framecount = 0
+            self.lasttime = time.perf_counter()
+            self.totaltime = 0

     _start: float
     _timestamp: int
@@ -68,7 +72,7 @@ class PlayerStreamTrack(MediaStreamTrack):
             return self._timestamp, AUDIO_TIME_BASE

     async def recv(self) -> Union[Frame, Packet]:
-        # frame = self.frames[self.counter % 30]
+        # frame = self.frames[self.counter % 30]
         self._player._start(self)
         frame = await self._queue.get()
         pts, time_base = await self.next_timestamp()
@@ -77,6 +81,14 @@ class PlayerStreamTrack(MediaStreamTrack):
         if frame is None:
             self.stop()
             raise Exception
+        if self.kind == 'video':
+            self.totaltime += (time.perf_counter() - self.lasttime)
+            self.framecount += 1
+            self.lasttime = time.perf_counter()
+            if self.framecount==100:
+                print(f"------actual avg final fps:{self.framecount/self.totaltime:.4f}")
+                self.framecount = 0
+                self.totaltime=0
         return frame

     def stop(self):
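The webrtc.py hunks add a delivery-rate probe to the video track, complementing the renamed "actual avg infer fps" counter in nerfreal.py: the time spent handing out each window of 100 frames is accumulated and the average "final fps" is printed. A standalone sketch of the same windowed counter follows; FpsProbe is an illustrative name, not a class from this repo.

```python
import time

class FpsProbe:
    """Average frames per second over a fixed window, like the counters in this patch."""

    def __init__(self, window=100):
        self.window = window
        self.framecount = 0
        self.totaltime = 0.0
        self.lasttime = time.perf_counter()

    def tick(self, label='final'):
        # Call once per delivered frame; prints the average every `window` frames.
        now = time.perf_counter()
        self.totaltime += now - self.lasttime
        self.lasttime = now
        self.framecount += 1
        if self.framecount == self.window:
            print(f"------actual avg {label} fps:{self.framecount / self.totaltime:.4f}")
            self.framecount = 0
            self.totaltime = 0.0
```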