From 6a1f2e4f48538cfd2d308f185332846e499c00f9 Mon Sep 17 00:00:00 2001 From: lipku Date: Sat, 4 May 2024 10:10:41 +0800 Subject: [PATCH] support custom video in silence --- README.md | 19 +++++++--- app.py | 4 +++ asrreal.py | 14 ++++---- nerfreal.py | 100 +++++++++++++++++++++++++++++++--------------------- 4 files changed, 86 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 3088727..4b085bc 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ A streaming digital human based on the Ernerf model, realize audio video synch 3. 支持多种音频特征驱动:wav2vec、hubert 4. 支持全身视频拼接 5. 支持rtmp和webrtc +6. 支持视频编排:不说话时播放自定义视频 ## 1. Installation @@ -106,14 +107,24 @@ python app.py --fullbody --fullbody_img data/fullbody/img --fullbody_offset_x 10 - --W、--H 训练视频的宽、高 - ernerf训练第三步torso如果训练的不好,在拼接处会有接缝。可以在上面的命令加上--torso_imgs data/xxx/torso_imgs,torso不用模型推理,直接用训练数据集里的torso图片。这种方式可能头颈处会有些人工痕迹。 -### 3.6 webrtc p2p +### 3.6 不说话时用自定义视频替代 +- 提取自定义视频图片 +``` +ffmpeg -i silence.mp4 -vf fps=25 -qmin 1 -q:v 1 -start_number 0 data/customvideo/img/%d.png +``` +- 运行数字人 +``` +python app.py --customvideo --customvideo_img data/customvideo/img --customvideo_imgnum 100 +``` + +### 3.7 webrtc p2p 此种模式不需要srs ``` python app.py --transport webrtc ``` 用浏览器打开http://serverip:8010/webrtc.html -### 3.7 rtmp推送到srs +### 3.8 rtmp推送到srs - 安装rtmpstream库 参照 https://github.com/lipku/python_rtmpstream @@ -121,7 +132,7 @@ python app.py --transport webrtc ``` docker run --rm -it -p 1935:1935 -p 1985:1985 -p 8080:8080 registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5 ``` -- 然后运行 +- 运行数字人 ```python python app.py --transport rtmp --push_url 'rtmp://localhost/live/livestream' ``` @@ -162,7 +173,7 @@ docker版本已经不是最新代码,可以作为一个空环境,把最新 ## 8. TODO - [x] 添加chatgpt实现数字人对话 - [x] 声音克隆 -- [ ] 数字人静音时用一段视频代替 +- [x] 数字人静音时用一段视频代替 如果本项目对你有帮助,帮忙点个star。也欢迎感兴趣的朋友一起来完善该项目。 Email: lipku@foxmail.com diff --git a/app.py b/app.py index 2785379..543ef9a 100644 --- a/app.py +++ b/app.py @@ -390,6 +390,10 @@ if __name__ == '__main__': parser.add_argument('--fullbody_offset_x', type=int, default=0) parser.add_argument('--fullbody_offset_y', type=int, default=0) + parser.add_argument('--customvideo', action='store_true', help="custom video") + parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img') + parser.add_argument('--customvideo_imgnum', type=int, default=1) + parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits parser.add_argument('--REF_FILE', type=str, default=None) parser.add_argument('--TTS_SERVER', type=str, default='http://localhost:9000') #http://127.0.0.1:5000 diff --git a/asrreal.py b/asrreal.py index 457feeb..75d0f49 100644 --- a/asrreal.py +++ b/asrreal.py @@ -171,7 +171,7 @@ class ASR: return # get a frame of audio - frame = self.__get_audio_frame() + frame,type = self.__get_audio_frame() # the last frame if frame is None: @@ -180,7 +180,7 @@ class ASR: else: self.frames.append(frame) # put to output - self.output_queue.put(frame) + self.output_queue.put((frame,type)) # context not enough, do not run network. if len(self.frames) < self.stride_left_size + self.context_size + self.stride_right_size: return @@ -236,25 +236,27 @@ class ASR: def __get_audio_frame(self): if self.inwarm: # warm up - return np.zeros(self.chunk, dtype=np.float32) + return np.zeros(self.chunk, dtype=np.float32),1 if self.mode == 'file': if self.idx < self.file_stream.shape[0]: frame = self.file_stream[self.idx: self.idx + self.chunk] self.idx = self.idx + self.chunk - return frame + return frame,0 else: - return None + return None,0 else: try: frame = self.queue.get(block=False) + type = 0 print(f'[INFO] get frame {frame.shape}') except queue.Empty: frame = np.zeros(self.chunk, dtype=np.float32) + type = 1 self.idx = self.idx + self.chunk - return frame + return frame,type def __frame_to_text(self, frame): diff --git a/nerfreal.py b/nerfreal.py index 8f61539..074219d 100644 --- a/nerfreal.py +++ b/nerfreal.py @@ -57,6 +57,8 @@ class NeRFReal: self.ind_index = 0 self.ind_num = trainer.model.individual_codes.shape[0] + self.customimg_index = 0 + # build asr if self.opt.asr: self.asr = ASR(opt) @@ -112,7 +114,16 @@ class NeRFReal: self.asr.push_audio(chunk) def before_push_audio(self): - self.asr.before_push_audio() + self.asr.before_push_audio() + + def mirror_index(self, index): + size = self.opt.customvideo_imgnum + turn = index // size + res = index % size + if turn % 2 == 0: + return res + else: + return size - res - 1 def prepare_buffer(self, outputs): if self.mode == 'image': @@ -136,53 +147,60 @@ class NeRFReal: # use the live audio stream data['auds'] = self.asr.get_next_feat() + audiotype = 0 + if self.opt.transport=='rtmp': + for _ in range(2): + frame,type = self.asr.get_audio_out() + audiotype += type + #print(f'[INFO] get_audio_out shape ',frame.shape) + self.streamer.stream_frame_audio(frame) + else: + for _ in range(2): + frame,type = self.asr.get_audio_out() + audiotype += type + frame = (frame * 32767).astype(np.int16) + new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) + new_frame.planes[0].update(frame.tobytes()) + new_frame.sample_rate=16000 + asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) #t = time.time() - outputs = self.trainer.test_gui_with_data(data, self.W, self.H) - #print('-------ernerf time: ',time.time()-t) - #print(f'[INFO] outputs shape ',outputs['image'].shape) - image = (outputs['image'] * 255).astype(np.uint8) - if not self.opt.fullbody: + if self.opt.customvideo and audiotype!=0: + self.loader = iter(self.data_loader) #init + imgindex = self.mirror_index(self.customimg_index) + #print('custom img index:',imgindex) + image = cv2.imread(os.path.join(self.opt.customvideo_img, str(int(imgindex))+'.png')) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) if self.opt.transport=='rtmp': self.streamer.stream_frame(image) else: new_frame = VideoFrame.from_ndarray(image, format="rgb24") asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) - else: #fullbody human - #print("frame index:",data['index']) - image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg')) - image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB) - start_x = self.opt.fullbody_offset_x # 合并后小图片的起始x坐标 - start_y = self.opt.fullbody_offset_y # 合并后小图片的起始y坐标 - image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image - if self.opt.transport=='rtmp': - self.streamer.stream_frame(image_fullbody) - else: - new_frame = VideoFrame.from_ndarray(image_fullbody, format="rgb24") - asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) - #self.pipe.stdin.write(image.tostring()) - if self.opt.transport=='rtmp': - for _ in range(2): - frame = self.asr.get_audio_out() - #print(f'[INFO] get_audio_out shape ',frame.shape) - self.streamer.stream_frame_audio(frame) + self.customimg_index += 1 else: - for _ in range(2): - frame = self.asr.get_audio_out() - frame = (frame * 32767).astype(np.int16) - new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) - new_frame.planes[0].update(frame.tobytes()) - new_frame.sample_rate=16000 - asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) - # frame1 = self.asr.get_audio_out() - # frame2 = self.asr.get_audio_out() - # frame = np.concatenate((frame1,frame2)) - # frame = (frame * 32767).astype(np.int16) - # new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) - # new_frame.planes[0].update(frame.tobytes()) - # new_frame.sample_rate=16000 - # asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) - # frame = (frame * 32767).astype(np.int16).tobytes() - # self.fifo_audio.write(frame) + self.customimg_index = 0 + outputs = self.trainer.test_gui_with_data(data, self.W, self.H) + #print('-------ernerf time: ',time.time()-t) + #print(f'[INFO] outputs shape ',outputs['image'].shape) + image = (outputs['image'] * 255).astype(np.uint8) + if not self.opt.fullbody: + if self.opt.transport=='rtmp': + self.streamer.stream_frame(image) + else: + new_frame = VideoFrame.from_ndarray(image, format="rgb24") + asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) + else: #fullbody human + #print("frame index:",data['index']) + image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg')) + image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB) + start_x = self.opt.fullbody_offset_x # 合并后小图片的起始x坐标 + start_y = self.opt.fullbody_offset_y # 合并后小图片的起始y坐标 + image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image + if self.opt.transport=='rtmp': + self.streamer.stream_frame(image_fullbody) + else: + new_frame = VideoFrame.from_ndarray(image_fullbody, format="rgb24") + asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) + #self.pipe.stdin.write(image.tostring()) else: if self.audio_features is not None: auds = get_audio_features(self.audio_features, self.opt.att, self.audio_idx)