diff --git a/README.md b/README.md
index 3088727..4b085bc 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ A streaming digital human based on the Ernerf model, realizing audio-video synch
 3. Supports multiple audio-feature extractors: wav2vec, hubert
 4. Supports full-body video compositing
 5. Supports rtmp and webrtc
+6. Supports video orchestration: play a custom video while the avatar is not speaking
 
 ## 1. Installation
 
@@ -106,14 +107,24 @@ python app.py --fullbody --fullbody_img data/fullbody/img --fullbody_offset_x 10
 - --W, --H: width and height of the training video
 - If the torso from the third ernerf training step is poorly trained, a seam will show at the join. You can add --torso_imgs data/xxx/torso_imgs to the command above; the torso is then taken directly from the torso images in the training dataset instead of being inferred by the model. This approach may leave some artifacts around the head and neck.
 
-### 3.6 webrtc p2p
+### 3.6 Play a custom video while not speaking
+- Extract frames from the custom video
+```
+ffmpeg -i silence.mp4 -vf fps=25 -qmin 1 -q:v 1 -start_number 0 data/customvideo/img/%d.png
+```
+- Run the digital human
+```
+python app.py --customvideo --customvideo_img data/customvideo/img --customvideo_imgnum 100
+```
+
+### 3.7 webrtc p2p
 This mode does not require srs
 ```
 python app.py --transport webrtc
 ```
 Open http://serverip:8010/webrtc.html in a browser
 
-### 3.7 Push rtmp to srs
+### 3.8 Push rtmp to srs
 - Install the rtmpstream library,
 see https://github.com/lipku/python_rtmpstream
 
@@ -121,7 +132,7 @@ python app.py --transport webrtc
 ```
 docker run --rm -it -p 1935:1935 -p 1985:1985 -p 8080:8080 registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5
 ```
-- Then run
+- Run the digital human
 ```python
 python app.py --transport rtmp --push_url 'rtmp://localhost/live/livestream'
 ```
@@ -162,7 +173,7 @@ The docker image is not the latest code; use it as a bare environment and copy
 ## 8. TODO
 - [x] Add chatgpt-based dialogue for the digital human
 - [x] Voice cloning
-- [ ] Play a video clip while the digital human is silent
+- [x] Play a video clip while the digital human is silent
 
 If this project helps you, please give it a star. Anyone interested is welcome to help improve the project.
 Email: lipku@foxmail.com
diff --git a/app.py b/app.py
index 2785379..543ef9a 100644
--- a/app.py
+++ b/app.py
@@ -390,6 +390,10 @@ if __name__ == '__main__':
     parser.add_argument('--fullbody_offset_x', type=int, default=0)
     parser.add_argument('--fullbody_offset_y', type=int, default=0)
 
+    parser.add_argument('--customvideo', action='store_true', help="custom video")
+    parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img')
+    parser.add_argument('--customvideo_imgnum', type=int, default=1)
+
     parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
     parser.add_argument('--REF_FILE', type=str, default=None)
     parser.add_argument('--TTS_SERVER', type=str, default='http://localhost:9000') #http://127.0.0.1:5000
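Note on the README and app.py changes above: the ffmpeg step writes frames named 0.png, 1.png, … at 25 fps, and --customvideo_imgnum should not exceed the number of frames actually extracted, otherwise the renderer will try to read a file that does not exist. A minimal sketch (a hypothetical helper, not part of this patch) that derives the count from the output directory:

```python
# Sketch: count the frames ffmpeg wrote and print a matching launch command.
# Assumes the default --customvideo_img directory; adjust the path as needed.
import os

img_dir = 'data/customvideo/img'
imgnum = sum(1 for f in os.listdir(img_dir) if f.endswith('.png'))
print(f'python app.py --customvideo --customvideo_img {img_dir} '
      f'--customvideo_imgnum {imgnum}')
```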
diff --git a/asrreal.py b/asrreal.py
index 457feeb..75d0f49 100644
--- a/asrreal.py
+++ b/asrreal.py
@@ -171,7 +171,7 @@ class ASR:
             return
 
         # get a frame of audio
-        frame = self.__get_audio_frame()
+        frame,type = self.__get_audio_frame()
 
         # the last frame
         if frame is None:
@@ -180,7 +180,7 @@ class ASR:
         else:
             self.frames.append(frame)
             # put to output
-            self.output_queue.put(frame)
+            self.output_queue.put((frame,type))
             # context not enough, do not run network.
             if len(self.frames) < self.stride_left_size + self.context_size + self.stride_right_size:
                 return
@@ -236,25 +236,27 @@ class ASR:
     def __get_audio_frame(self):
         if self.inwarm: # warm up
-            return np.zeros(self.chunk, dtype=np.float32)
+            return np.zeros(self.chunk, dtype=np.float32),1
 
         if self.mode == 'file':
             if self.idx < self.file_stream.shape[0]:
                 frame = self.file_stream[self.idx: self.idx + self.chunk]
                 self.idx = self.idx + self.chunk
-                return frame
+                return frame,0
             else:
-                return None
+                return None,0
         else:
             try:
                 frame = self.queue.get(block=False)
+                type = 0
                 print(f'[INFO] get frame {frame.shape}')
             except queue.Empty:
                 frame = np.zeros(self.chunk, dtype=np.float32)
+                type = 1
 
             self.idx = self.idx + self.chunk
 
-            return frame
+            return frame,type
 
     def __frame_to_text(self, frame):
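The asrreal.py change above threads a silence flag through the audio path: __get_audio_frame() now returns a (frame, type) pair, where type 0 marks real audio (from the file stream or the input queue) and type 1 marks zero-filled filler (queue empty, or warm-up). A standalone sketch of that convention, with an assumed chunk size:

```python
# Standalone sketch of the (frame, type) convention introduced above.
# CHUNK is an assumed value; in the real code it comes from the ASR options.
import queue
import numpy as np

CHUNK = 320
audio_queue = queue.Queue()

def get_audio_frame():
    try:
        frame = audio_queue.get(block=False)
        ftype = 0  # real audio pulled from the queue
    except queue.Empty:
        frame = np.zeros(CHUNK, dtype=np.float32)
        ftype = 1  # nothing to play: emit silence and flag it
    return frame, ftype

audio_queue.put(np.ones(CHUNK, dtype=np.float32))
print(get_audio_frame()[1])  # 0 -> real audio
print(get_audio_frame()[1])  # 1 -> silence filler
```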
diff --git a/nerfreal.py b/nerfreal.py
index 8f61539..074219d 100644
--- a/nerfreal.py
+++ b/nerfreal.py
@@ -57,6 +57,8 @@ class NeRFReal:
             self.ind_index = 0
             self.ind_num = trainer.model.individual_codes.shape[0]
 
+        self.customimg_index = 0
+
         # build asr
         if self.opt.asr:
             self.asr = ASR(opt)
@@ -112,7 +114,16 @@ class NeRFReal:
             self.asr.push_audio(chunk)
 
     def before_push_audio(self):
-        self.asr.before_push_audio()
+        self.asr.before_push_audio()
+
+    def mirror_index(self, index):
+        size = self.opt.customvideo_imgnum
+        turn = index // size
+        res = index % size
+        if turn % 2 == 0:
+            return res
+        else:
+            return size - res - 1
 
     def prepare_buffer(self, outputs):
         if self.mode == 'image':
@@ -136,53 +147,60 @@ class NeRFReal:
                 # use the live audio stream
                 data['auds'] = self.asr.get_next_feat()
 
+                audiotype = 0
+                if self.opt.transport=='rtmp':
+                    for _ in range(2):
+                        frame,type = self.asr.get_audio_out()
+                        audiotype += type
+                        #print(f'[INFO] get_audio_out shape ',frame.shape)
+                        self.streamer.stream_frame_audio(frame)
+                else:
+                    for _ in range(2):
+                        frame,type = self.asr.get_audio_out()
+                        audiotype += type
+                        frame = (frame * 32767).astype(np.int16)
+                        new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
+                        new_frame.planes[0].update(frame.tobytes())
+                        new_frame.sample_rate=16000
+                        asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
+
                 #t = time.time()
-                outputs = self.trainer.test_gui_with_data(data, self.W, self.H)
-                #print('-------ernerf time: ',time.time()-t)
-                #print(f'[INFO] outputs shape ',outputs['image'].shape)
-                image = (outputs['image'] * 255).astype(np.uint8)
-                if not self.opt.fullbody:
+                if self.opt.customvideo and audiotype!=0:
+                    self.loader = iter(self.data_loader) #init
+                    imgindex = self.mirror_index(self.customimg_index)
+                    #print('custom img index:',imgindex)
+                    image = cv2.imread(os.path.join(self.opt.customvideo_img, str(int(imgindex))+'.png'))
+                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                     if self.opt.transport=='rtmp':
                         self.streamer.stream_frame(image)
                     else:
                         new_frame = VideoFrame.from_ndarray(image, format="rgb24")
                         asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop)
-                else: #fullbody human
-                    #print("frame index:",data['index'])
-                    image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg'))
-                    image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB)
-                    start_x = self.opt.fullbody_offset_x # start x of the head image in the composite
-                    start_y = self.opt.fullbody_offset_y # start y of the head image in the composite
-                    image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image
-                    if self.opt.transport=='rtmp':
-                        self.streamer.stream_frame(image_fullbody)
-                    else:
-                        new_frame = VideoFrame.from_ndarray(image_fullbody, format="rgb24")
-                        asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop)
-                #self.pipe.stdin.write(image.tostring())
-                if self.opt.transport=='rtmp':
-                    for _ in range(2):
-                        frame = self.asr.get_audio_out()
-                        #print(f'[INFO] get_audio_out shape ',frame.shape)
-                        self.streamer.stream_frame_audio(frame)
+                    self.customimg_index += 1
                 else:
-                    for _ in range(2):
-                        frame = self.asr.get_audio_out()
-                        frame = (frame * 32767).astype(np.int16)
-                        new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
-                        new_frame.planes[0].update(frame.tobytes())
-                        new_frame.sample_rate=16000
-                        asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
-                    # frame1 = self.asr.get_audio_out()
-                    # frame2 = self.asr.get_audio_out()
-                    # frame = np.concatenate((frame1,frame2))
-                    # frame = (frame * 32767).astype(np.int16)
-                    # new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
-                    # new_frame.planes[0].update(frame.tobytes())
-                    # new_frame.sample_rate=16000
-                    # asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop)
-                    # frame = (frame * 32767).astype(np.int16).tobytes()
-                    # self.fifo_audio.write(frame)
+                    self.customimg_index = 0
+                    outputs = self.trainer.test_gui_with_data(data, self.W, self.H)
+                    #print('-------ernerf time: ',time.time()-t)
+                    #print(f'[INFO] outputs shape ',outputs['image'].shape)
+                    image = (outputs['image'] * 255).astype(np.uint8)
+                    if not self.opt.fullbody:
+                        if self.opt.transport=='rtmp':
+                            self.streamer.stream_frame(image)
+                        else:
+                            new_frame = VideoFrame.from_ndarray(image, format="rgb24")
+                            asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop)
+                    else: #fullbody human
+                        #print("frame index:",data['index'])
+                        image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg'))
+                        image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB)
+                        start_x = self.opt.fullbody_offset_x # start x of the head image in the composite
+                        start_y = self.opt.fullbody_offset_y # start y of the head image in the composite
+                        image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image
+                        if self.opt.transport=='rtmp':
+                            self.streamer.stream_frame(image_fullbody)
+                        else:
+                            new_frame = VideoFrame.from_ndarray(image_fullbody, format="rgb24")
+                            asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop)
+                    #self.pipe.stdin.write(image.tostring())
             else:
                 if self.audio_features is not None:
                     auds = get_audio_features(self.audio_features, self.opt.att, self.audio_idx)
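In the nerfreal.py change above, the render loop sums the type flags of the two audio chunks consumed per video frame; a nonzero audiotype means at least one chunk was silence, so a frame of the custom video is streamed instead of an ernerf render, and customimg_index resets to 0 once speech resumes. mirror_index() plays the clip forward then backward (ping-pong), so the loop point has no visible jump. A standalone sketch of that indexing:

```python
# Standalone sketch of the ping-pong indexing used for the idle clip:
# frames 0..size-1 play forward on even passes and backward on odd passes.
def mirror_index(index, size):
    turn, res = divmod(index, size)
    return res if turn % 2 == 0 else size - res - 1

# With size=3, successive frames are 0 1 2 2 1 0 0 1 2 ...
print([mirror_index(i, 3) for i in range(9)])
```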