support custom video in silence

parent 6978f89ec2
commit 6a1f2e4f48

 README.md | 19

@@ -9,6 +9,7 @@ A streaming digital human based on the Ernerf model, realize audio video synch
 3. Supports multiple audio feature extractors: wav2vec, hubert
 4. Supports full-body video compositing
 5. Supports rtmp and webrtc
+6. Supports video orchestration: plays a custom video while the avatar is not speaking

 ## 1. Installation

@@ -106,14 +107,24 @@ python app.py --fullbody --fullbody_img data/fullbody/img --fullbody_offset_x 10
 - --W, --H: width and height of the training video
 - If the torso from step 3 of ernerf training is not well trained, there will be a seam at the join. You can add --torso_imgs data/xxx/torso_imgs to the command above so the torso is taken directly from the torso images in the training dataset instead of being inferred by the model; this may leave some artifacts around the head and neck.

-### 3.6 webrtc p2p
+### 3.6 Play a custom video while the avatar is silent
+- Extract frames from the custom video
+```
+ffmpeg -i silence.mp4 -vf fps=25 -qmin 1 -q:v 1 -start_number 0 data/customvideo/img/%d.png
+```
+- Run the digital human
+```
+python app.py --customvideo --customvideo_img data/customvideo/img --customvideo_imgnum 100
+```
+
+### 3.7 webrtc p2p
 This mode does not require srs
 ```
 python app.py --transport webrtc
 ```
 Open http://serverip:8010/webrtc.html in a browser

-### 3.7 Push rtmp to srs
+### 3.8 Push rtmp to srs
 - Install the rtmpstream library
 See https://github.com/lipku/python_rtmpstream
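A practical note on the custom-video section above: judging from the reading loop added in nerfreal.py, frame indices 0 through customvideo_imgnum-1 are opened as `<index>.png`, so --customvideo_imgnum should not exceed the number of frames the ffmpeg step actually produced. A throwaway sketch (not part of the repo) to get that number:

```python
import os

# Count the PNG frames written by the ffmpeg extraction step, so the value
# passed to --customvideo_imgnum matches what is actually on disk.
frames = [f for f in os.listdir('data/customvideo/img') if f.endswith('.png')]
print(len(frames))  # use this value for --customvideo_imgnum
```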
@@ -121,7 +132,7 @@ python app.py --transport webrtc
 ```
 docker run --rm -it -p 1935:1935 -p 1985:1985 -p 8080:8080 registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5
 ```
-- Then run
+- Run the digital human
 ```python
 python app.py --transport rtmp --push_url 'rtmp://localhost/live/livestream'
 ```
@@ -162,7 +173,7 @@ The docker version is no longer the latest code; it can be used as a clean environment, with the latest
 ## 8. TODO
 - [x] Add chatgpt-based dialogue for the digital human
 - [x] Voice cloning
-- [ ] Play a custom video clip while the digital human is silent
+- [x] Play a custom video clip while the digital human is silent

 If this project helps you, please give it a star. Friends interested in the project are also welcome to help improve it.
 Email: lipku@foxmail.com
 app.py | 4

@@ -390,6 +390,10 @@ if __name__ == '__main__':
     parser.add_argument('--fullbody_offset_x', type=int, default=0)
     parser.add_argument('--fullbody_offset_y', type=int, default=0)

+    parser.add_argument('--customvideo', action='store_true', help="custom video")
+    parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img')
+    parser.add_argument('--customvideo_imgnum', type=int, default=1)
+
     parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
     parser.add_argument('--REF_FILE', type=str, default=None)
     parser.add_argument('--TTS_SERVER', type=str, default='http://localhost:9000') #http://127.0.0.1:5000
 asrreal.py | 14

@@ -171,7 +171,7 @@ class ASR:
             return

         # get a frame of audio
-        frame = self.__get_audio_frame()
+        frame,type = self.__get_audio_frame()

         # the last frame
         if frame is None:
@@ -180,7 +180,7 @@
         else:
             self.frames.append(frame)
             # put to output
-            self.output_queue.put(frame)
+            self.output_queue.put((frame,type))
             # context not enough, do not run network.
             if len(self.frames) < self.stride_left_size + self.context_size + self.stride_right_size:
                 return
@@ -236,25 +236,27 @@

     def __get_audio_frame(self):
         if self.inwarm: # warm up
-            return np.zeros(self.chunk, dtype=np.float32)
+            return np.zeros(self.chunk, dtype=np.float32),1

         if self.mode == 'file':
             if self.idx < self.file_stream.shape[0]:
                 frame = self.file_stream[self.idx: self.idx + self.chunk]
                 self.idx = self.idx + self.chunk
-                return frame
+                return frame,0
             else:
-                return None
+                return None,0
         else:
             try:
                 frame = self.queue.get(block=False)
+                type = 0
                 print(f'[INFO] get frame {frame.shape}')
             except queue.Empty:
                 frame = np.zeros(self.chunk, dtype=np.float32)
+                type = 1

             self.idx = self.idx + self.chunk

-            return frame
+            return frame,type


     def __frame_to_text(self, frame):
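The change above threads a second return value through the audio path: judging from the diff, type 0 marks real audio pulled from the input queue and type 1 marks synthesized silence (warm-up, or the queue being empty). A minimal self-contained sketch of that convention, with the chunk size assumed rather than taken from the class:

```python
import queue

import numpy as np

CHUNK = 320  # assumed frame size in samples; the real value is self.chunk

def get_audio_frame(q: queue.Queue):
    """Sketch of the live-mode branch of __get_audio_frame after this commit."""
    try:
        frame = q.get(block=False)
        return frame, 0  # real audio pulled from the input queue
    except queue.Empty:
        # no audio available: emit a silent frame and flag it with type 1
        return np.zeros(CHUNK, dtype=np.float32), 1
```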
 nerfreal.py | 100

@@ -57,6 +57,8 @@ class NeRFReal:
         self.ind_index = 0
         self.ind_num = trainer.model.individual_codes.shape[0]

+        self.customimg_index = 0
+
         # build asr
         if self.opt.asr:
             self.asr = ASR(opt)
@@ -112,7 +114,16 @@
         self.asr.push_audio(chunk)

     def before_push_audio(self):
         self.asr.before_push_audio()

+    def mirror_index(self, index):
+        size = self.opt.customvideo_imgnum
+        turn = index // size
+        res = index % size
+        if turn % 2 == 0:
+            return res
+        else:
+            return size - res - 1
+
     def prepare_buffer(self, outputs):
         if self.mode == 'image':
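mirror_index turns a monotonically increasing counter into a back-and-forth (ping-pong) walk over the frame indices, so the custom clip loops without a visible jump from its last frame back to its first. A standalone sketch of the same logic, with size passed explicitly instead of read from self.opt:

```python
def mirror_index(index: int, size: int) -> int:
    # Even passes play forward (0..size-1); odd passes play backward.
    turn, res = divmod(index, size)
    return res if turn % 2 == 0 else size - res - 1

# For size=3 the walk is 0,1,2,2,1,0,0,1,...
assert [mirror_index(i, 3) for i in range(8)] == [0, 1, 2, 2, 1, 0, 0, 1]
```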
@@ -136,53 +147,60 @@
             # use the live audio stream
             data['auds'] = self.asr.get_next_feat()

+            audiotype = 0
+            if self.opt.transport=='rtmp':
+                for _ in range(2):
+                    frame,type = self.asr.get_audio_out()
+                    audiotype += type
+                    #print(f'[INFO] get_audio_out shape ',frame.shape)
+                    self.streamer.stream_frame_audio(frame)
+            else:
+                for _ in range(2):
+                    frame,type = self.asr.get_audio_out()
+                    audiotype += type
+                    frame = (frame * 32767).astype(np.int16)
+                    new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
+                    new_frame.planes[0].update(frame.tobytes())
+                    new_frame.sample_rate=16000
+                    asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
             #t = time.time()
-            outputs = self.trainer.test_gui_with_data(data, self.W, self.H)
-            #print('-------ernerf time: ',time.time()-t)
-            #print(f'[INFO] outputs shape ',outputs['image'].shape)
-            image = (outputs['image'] * 255).astype(np.uint8)
-            if not self.opt.fullbody:
-                if self.opt.transport=='rtmp':
-                    self.streamer.stream_frame(image)
-                else:
-                    new_frame = VideoFrame.from_ndarray(image, format="rgb24")
-                    asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop)
-            else: #fullbody human
-                #print("frame index:",data['index'])
-                image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg'))
-                image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB)
-                start_x = self.opt.fullbody_offset_x # starting x coordinate of the pasted head image in the composite
-                start_y = self.opt.fullbody_offset_y # starting y coordinate of the pasted head image in the composite
-                image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image
-                if self.opt.transport=='rtmp':
-                    self.streamer.stream_frame(image_fullbody)
-                else:
-                    new_frame = VideoFrame.from_ndarray(image_fullbody, format="rgb24")
-                    asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop)
-            #self.pipe.stdin.write(image.tostring())
-            if self.opt.transport=='rtmp':
-                for _ in range(2):
-                    frame = self.asr.get_audio_out()
-                    #print(f'[INFO] get_audio_out shape ',frame.shape)
-                    self.streamer.stream_frame_audio(frame)
-            else:
-                for _ in range(2):
-                    frame = self.asr.get_audio_out()
-                    frame = (frame * 32767).astype(np.int16)
-                    new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
-                    new_frame.planes[0].update(frame.tobytes())
-                    new_frame.sample_rate=16000
-                    asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
-            # frame1 = self.asr.get_audio_out()
-            # frame2 = self.asr.get_audio_out()
-            # frame = np.concatenate((frame1,frame2))
-            # frame = (frame * 32767).astype(np.int16)
-            # new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
-            # new_frame.planes[0].update(frame.tobytes())
-            # new_frame.sample_rate=16000
-            # asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
-            # frame = (frame * 32767).astype(np.int16).tobytes()
-            # self.fifo_audio.write(frame)
+            if self.opt.customvideo and audiotype!=0:
+                self.loader = iter(self.data_loader) #init
+                imgindex = self.mirror_index(self.customimg_index)
+                #print('custom img index:',imgindex)
+                image = cv2.imread(os.path.join(self.opt.customvideo_img, str(int(imgindex))+'.png'))
+                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+                if self.opt.transport=='rtmp':
+                    self.streamer.stream_frame(image)
+                else:
+                    new_frame = VideoFrame.from_ndarray(image, format="rgb24")
+                    asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop)
+                self.customimg_index += 1
+            else:
+                self.customimg_index = 0
+                outputs = self.trainer.test_gui_with_data(data, self.W, self.H)
+                #print('-------ernerf time: ',time.time()-t)
+                #print(f'[INFO] outputs shape ',outputs['image'].shape)
+                image = (outputs['image'] * 255).astype(np.uint8)
+                if not self.opt.fullbody:
+                    if self.opt.transport=='rtmp':
+                        self.streamer.stream_frame(image)
+                    else:
+                        new_frame = VideoFrame.from_ndarray(image, format="rgb24")
+                        asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop)
+                else: #fullbody human
+                    #print("frame index:",data['index'])
+                    image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg'))
+                    image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB)
+                    start_x = self.opt.fullbody_offset_x # starting x coordinate of the pasted head image in the composite
+                    start_y = self.opt.fullbody_offset_y # starting y coordinate of the pasted head image in the composite
+                    image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image
+                    if self.opt.transport=='rtmp':
+                        self.streamer.stream_frame(image_fullbody)
+                    else:
+                        new_frame = VideoFrame.from_ndarray(image_fullbody, format="rgb24")
+                        asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop)
+                #self.pipe.stdin.write(image.tostring())
         else:
             if self.audio_features is not None:
                 auds = get_audio_features(self.audio_features, self.opt.att, self.audio_idx)
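Taken together, the render loop now pushes two audio chunks per video tick, sums their type flags into audiotype, and switches to the custom video whenever at least one chunk was silence. A condensed sketch of that per-tick decision (function and variable names here are illustrative, not from the repo):

```python
def choose_video_source(audio_types, customvideo_enabled):
    # audio_types holds the two per-tick flags returned by get_audio_out();
    # a sum of 0 means both chunks carried real speech.
    audiotype = sum(audio_types)
    if customvideo_enabled and audiotype != 0:
        return 'custom'  # avatar is silent: show the pre-extracted video frame
    return 'nerf'        # avatar is speaking: render the head with the model
```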