diff --git a/README.md b/README.md index 0ec1d9f..b0a6c09 100644 --- a/README.md +++ b/README.md @@ -57,54 +57,37 @@ export HF_ENDPOINT=https://hf-mirror.com 备注:服务端需要开放端口 tcp:8000,8010,1985; udp:8000 ## 3. More Usage -### 3.1 使用LLM模型进行数字人对话 +分别选择数字人模型、传输方式、tts模型 -目前借鉴数字人对话系统[LinlyTalker](https://github.com/Kedreamix/Linly-Talker)的方式,LLM模型支持Chatgpt,Qwen和GeminiPro。需要在app.py中填入自己的api_key。 - -用浏览器打开http://serverip:8010/rtcpushchat.html - -### 3.2 声音克隆 -可以任意选用下面两种服务,推荐用gpt-sovits -#### 3.2.1 gpt-sovits -服务部署参照[gpt-sovits](/tts/README.md) -运行 +### 3.1 数字人模型 +支持3种模型:ernerf、musetalk、wav2lip,默认用ernerf +#### 3.1.1 ER-Nerf ``` -python app.py --tts gpt-sovits --TTS_SERVER http://127.0.0.1:9880 --REF_FILE data/ref.wav --REF_TEXT xxx +python app.py --model ernerf ``` -REF_TEXT为REF_FILE中语音内容,时长不宜过长 - -#### 3.2.2 xtts -运行xtts服务,参照 https://github.com/coqui-ai/xtts-streaming-server -``` -docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 9000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest -``` -然后运行,其中ref.wav为需要克隆的声音文件 -``` -python app.py --tts xtts --REF_FILE data/ref.wav --TTS_SERVER http://localhost:9000 -``` - -### 3.3 音频特征用hubert -如果训练模型时用的hubert提取音频特征,用如下命令启动数字人 +支持如下参数配置 +##### 3.1.1.1 音频特征用hubert +默认用的wav2lip,如果训练模型时用的hubert提取音频特征,用如下命令启动数字人 ``` python app.py --asr_model facebook/hubert-large-ls960-ft ``` -### 3.4 设置背景图片 +##### 3.1.1.2 设置头部背景图片 ``` python app.py --bg_img bc.jpg ``` -### 3.5 全身视频拼接 -#### 3.5.1 切割训练用的视频 +##### 3.1.1.3 全身视频贴回 +- 1.切割训练用的视频 ``` ffmpeg -i fullbody.mp4 -vf crop="400:400:100:5" train.mp4  ``` 用train.mp4训练模型 -#### 3.5.2 提取全身图片 +- 2.提取全身图片 ``` ffmpeg -i fullbody.mp4 -vf fps=25 -qmin 1 -q:v 1 -start_number 0 data/fullbody/img/%d.jpg ``` -#### 3.5.2 启动数字人 +- 3.启动数字人 ``` python app.py --fullbody --fullbody_img data/fullbody/img --fullbody_offset_x 100 --fullbody_offset_y 5 --fullbody_width 580 --fullbody_height 1080 --W 400 --H 400 ``` @@ -112,39 +95,7 @@ python app.py --fullbody --fullbody_img data/fullbody/img --fullbody_offset_x 10 - --W、--H 训练视频的宽、高 - ernerf训练第三步torso如果训练的不好,在拼接处会有接缝。可以在上面的命令加上--torso_imgs data/xxx/torso_imgs,torso不用模型推理,直接用训练数据集里的torso图片。这种方式可能头颈处会有些人工痕迹。 -### 3.6 不说话时用自定义视频替代 -- 提取自定义视频图片 -``` -ffmpeg -i silence.mp4 -vf fps=25 -qmin 1 -q:v 1 -start_number 0 data/customvideo/img/%d.png -``` -- 运行数字人 -``` -python app.py --customvideo --customvideo_img data/customvideo/img --customvideo_imgnum 100 -``` - -### 3.7 webrtc p2p -此种模式不需要srs -``` -python app.py --transport webrtc -``` -服务端需要开放端口 tcp:8010; udp:50000~60000 -用浏览器打开http://serverip:8010/webrtcapi.html - -### 3.8 rtmp推送到srs -- 安装rtmpstream库 -参照 https://github.com/lipku/python_rtmpstream - -- 启动srs -``` -docker run --rm -it -p 1935:1935 -p 1985:1985 -p 8080:8080 registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5 -``` -- 运行数字人 -```python -python app.py --transport rtmp --push_url 'rtmp://localhost/live/livestream' -``` -用浏览器打开http://serverip:8010/echoapi.html - -### 3.9 模型用musetalk +#### 3.1.2 模型用musetalk 暂不支持rtmp推送 - 安装依赖库 ```bash @@ -163,7 +114,7 @@ mim install "mmpose>=1.1.0" python app.py --model musetalk --transport webrtc 用浏览器打开http://serverip:8010/webrtcapi.html 可以设置--batch_size 提高显卡利用率,设置--avatar_id 运行不同的数字人 -#### 替换成自己的数字人 +##### 替换成自己的数字人 ```bash git clone https://github.com/TMElyralab/MuseTalk.git cd MuseTalk @@ -177,7 +128,7 @@ python simple_musetalk.py --avatar_id 4 --file D:\\ok\\test.mp4 支持视频和图片生成 会自动生成到data的avatars目录下 ``` -### 3.10 模型用wav2lip +#### 3.1.3 模型用wav2lip 暂不支持rtmp推送 - 下载模型 下载wav2lip运行需要的模型,链接: https://pan.baidu.com/s/1yOsQ06-RIDTJd3HFCw4wtA 密码: ltua @@ -187,12 +138,96 @@ python 
simple_musetalk.py --avatar_id 4 --file D:\\ok\\test.mp4 python app.py --transport webrtc --model wav2lip --avatar_id wav2lip_avatar1 用浏览器打开http://serverip:8010/webrtcapi.html 可以设置--batch_size 提高显卡利用率,设置--avatar_id 运行不同的数字人 -#### 替换成自己的数字人 +##### 替换成自己的数字人 ```bash cd wav2lip python genavatar.py --video_path xxx.mp4 运行后将results/avatars下文件拷到本项目的data/avatars下 ``` + +### 3.2 传输模式 +支持webrtc、rtcpush、rtmp,默认用rtcpush +#### 3.2.1 webrtc p2p +此种模式不需要srs +``` +python app.py --transport webrtc +``` +服务端需要开放端口 tcp:8010; udp:50000~60000 +用浏览器打开http://serverip:8010/webrtcapi.html + +#### 3.2.2 webrtc推送到srs +- 启动srs +``` +export CANDIDATE='<服务器外网ip>' +docker run --rm --env CANDIDATE=$CANDIDATE \ + -p 1935:1935 -p 8080:8080 -p 1985:1985 -p 8000:8000/udp \ + registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5 \ + objs/srs -c conf/rtc.conf +``` +- 运行数字人 +```python +python app.py --transport rtcpush --push_url 'http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream' +``` +用浏览器打开http://serverip:8010/rtcpushapi.html + +#### 3.2.3 rtmp推送到srs +- 安装rtmpstream库 +参照 https://github.com/lipku/python_rtmpstream + +- 启动srs +``` +docker run --rm -it -p 1935:1935 -p 1985:1985 -p 8080:8080 registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5 +``` +- 运行数字人 +```python +python app.py --transport rtmp --push_url 'rtmp://localhost/live/livestream' +``` +用浏览器打开http://serverip:8010/echoapi.html + +### 3.3 TTS模型 +支持edgetts、gpt-sovits、xtts,默认用edgetts +#### 3.3.1 gpt-sovits +服务部署参照[gpt-sovits](/tts/README.md) +运行 +``` +python app.py --tts gpt-sovits --TTS_SERVER http://127.0.0.1:9880 --REF_FILE data/ref.wav --REF_TEXT xxx +``` +REF_TEXT为REF_FILE中语音内容,时长不宜过长 + +#### 3.3.2 xtts +运行xtts服务,参照 https://github.com/coqui-ai/xtts-streaming-server +``` +docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 9000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest +``` +然后运行,其中ref.wav为需要克隆的声音文件 +``` +python app.py --tts xtts --REF_FILE data/ref.wav --TTS_SERVER http://localhost:9000 +``` + +### 3.4 视频编排 +- 1,生成素材 +``` +ffmpeg -i xxx.mp4 -s 576x768 -vf fps=25 -qmin 1 -q:v 1 -start_number 0 data/customvideo/image/%08d.png +ffmpeg -i xxx.mp4 -vn -acodec pcm_s16le -ac 1 -ar 16000 data/customvideo/audio.wav +``` +其中-s与输出视频大小一致 +- 2,编辑data/custom_config.json +指定imgpath和audiopath。 +设置audiotype,说明:0表示推理视频,不用设置;1表示静音视频,如果不设置默认用推理视频代替; 2以上自定义配置 +- 3,运行 +``` +python app.py --transport webrtc --customvideo_config data/custom_config.json +``` +- 4,打开http://:8010/webrtcapi-custom.html +填写custom_config.json中配置的audiotype,点击切换视频 + +### 3.5 使用LLM模型进行数字人对话 + +目前借鉴数字人对话系统[LinlyTalker](https://github.com/Kedreamix/Linly-Talker)的方式,LLM模型支持Chatgpt,Qwen和GeminiPro。需要在app.py中填入自己的api_key。 + +用浏览器打开http://serverip:8010/rtcpushchat.html + + ## 4. Docker Run 不需要前面的安装,直接运行。 @@ -206,7 +241,7 @@ https://www.codewithgpu.com/i/lipku/metahuman-stream/base [autodl教程](autodl/README.md) -## 5. 数字人模型文件 +## 5. ernerf数字人模型文件 可以替换成自己训练的模型(https://github.com/Fictionarry/ER-NeRF) ```python . 
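The new README section 3.4 above tells the reader to edit `data/custom_config.json` (specifying `imgpath` and `audiopath`, and assigning an `audiotype`) but does not show the file's shape. Based on the field names given in the README and the way `basereal.py` iterates the parsed entries (`item['audiotype']`), a minimal sketch of such a config, written from Python, might look like the following; the exact schema is an assumption and should be checked against the code actually being run.

```python
import json

# A minimal sketch of data/customvideo config entries, assuming the fields
# named in the README and read in basereal.py: audiotype, imgpath, audiopath.
# audiotype semantics per the README: 0 = inference video, 1 = silent/idle,
# 2 and above = user-defined custom clips.
custom_videos = [
    {
        "audiotype": 2,                            # custom state selected from webrtcapi-custom.html
        "imgpath": "data/customvideo/image",       # frames extracted with ffmpeg at 25 fps
        "audiopath": "data/customvideo/audio.wav"  # 16 kHz mono pcm_s16le audio
    }
]

with open("data/custom_config.json", "w", encoding="utf-8") as f:
    json.dump(custom_videos, f, ensure_ascii=False, indent=2)
```

With a config like this in place, the README's step 3 command (`python app.py --transport webrtc --customvideo_config data/custom_config.json`) loads each entry, and entering the configured `audiotype` in `webrtcapi-custom.html` switches playback to that clip.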
diff --git a/app.py b/app.py index 9292cd0..c24adb8 100644 --- a/app.py +++ b/app.py @@ -316,9 +316,9 @@ if __name__ == '__main__': parser.add_argument('--bbox_shift', type=int, default=5) parser.add_argument('--batch_size', type=int, default=16) - parser.add_argument('--customvideo', action='store_true', help="custom video") - parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img') - parser.add_argument('--customvideo_imgnum', type=int, default=1) + # parser.add_argument('--customvideo', action='store_true', help="custom video") + # parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img') + # parser.add_argument('--customvideo_imgnum', type=int, default=1) parser.add_argument('--customvideo_config', type=str, default='') diff --git a/basereal.py b/basereal.py index 4a7f5de..fa4ee36 100644 --- a/basereal.py +++ b/basereal.py @@ -15,6 +15,8 @@ from threading import Thread, Event from io import BytesIO import soundfile as sf +from ttsreal import EdgeTTS,VoitsTTS,XTTS + from tqdm import tqdm def read_imgs(img_list): frames = [] @@ -30,6 +32,13 @@ class BaseReal: self.sample_rate = 16000 self.chunk = self.sample_rate // opt.fps # 320 samples per chunk (20ms * 16000 / 1000) + if opt.tts == "edgetts": + self.tts = EdgeTTS(opt,self) + elif opt.tts == "gpt-sovits": + self.tts = VoitsTTS(opt,self) + elif opt.tts == "xtts": + self.tts = XTTS(opt,self) + self.curr_state=0 self.custom_img_cycle = {} self.custom_audio_cycle = {} @@ -48,7 +57,14 @@ class BaseReal: self.custom_audio_index[item['audiotype']] = 0 self.custom_index[item['audiotype']] = 0 self.custom_opt[item['audiotype']] = item - + + def init_customindex(self): + self.curr_state=0 + for key in self.custom_audio_index: + self.custom_audio_index[key]=0 + for key in self.custom_index: + self.custom_index[key]=0 + def mirror_index(self,size, index): #size = len(self.coord_list_cycle) turn = index // size @@ -62,11 +78,12 @@ class BaseReal: idx = self.custom_audio_index[audiotype] stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk] self.custom_audio_index[audiotype] += self.chunk - if self.custom_audio_index[audiotype]>=stream.shape[0]: + if self.custom_audio_index[audiotype]>=self.custom_audio_cycle[audiotype].shape[0]: self.curr_state = 1 #当前视频不循环播放,切换到静音状态 return stream def set_curr_state(self,audiotype, reinit): + print('set_curr_state:',audiotype) self.curr_state = audiotype if reinit: self.custom_audio_index[audiotype] = 0 diff --git a/lipreal.py b/lipreal.py index 460c43f..f4af934 100644 --- a/lipreal.py +++ b/lipreal.py @@ -166,12 +166,6 @@ class LipReal(BaseReal): self.asr = LipASR(opt,self) self.asr.warm_up() - if opt.tts == "edgetts": - self.tts = EdgeTTS(opt,self) - elif opt.tts == "gpt-sovits": - self.tts = VoitsTTS(opt,self) - elif opt.tts == "xtts": - self.tts = XTTS(opt,self) #self.__warm_up() self.render_event = mp.Event() @@ -257,6 +251,7 @@ class LipReal(BaseReal): # self.asr.warm_up() self.tts.render(quit_event) + self.init_customindex() process_thread = Thread(target=self.process_frames, args=(quit_event,loop,audio_track,video_track)) process_thread.start() diff --git a/museasr.py b/museasr.py index cb166a6..c5c767c 100644 --- a/museasr.py +++ b/museasr.py @@ -8,8 +8,8 @@ from baseasr import BaseASR from musetalk.whisper.audio2feature import Audio2Feature class MuseASR(BaseASR): - def __init__(self, opt, audio_processor:Audio2Feature): - super().__init__(opt) + def __init__(self, opt, parent,audio_processor:Audio2Feature): + super().__init__(opt,parent) 
self.audio_processor = audio_processor def run_step(self): diff --git a/musereal.py b/musereal.py index 0f01dcf..9e3d3b9 100644 --- a/musereal.py +++ b/musereal.py @@ -27,6 +27,7 @@ from ttsreal import EdgeTTS,VoitsTTS,XTTS from museasr import MuseASR import asyncio from av import AudioFrame, VideoFrame +from basereal import BaseReal from tqdm import tqdm def read_imgs(img_list): @@ -125,9 +126,10 @@ def inference(render_event,batch_size,latents_out_path,audio_feat_queue,audio_ou print('musereal inference processor stop') @torch.no_grad() -class MuseReal: +class MuseReal(BaseReal): def __init__(self, opt): - self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters. + super().__init__(opt) + #self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters. self.W = opt.W self.H = opt.H @@ -156,14 +158,8 @@ class MuseReal: self.__loadmodels() self.__loadavatar() - self.asr = MuseASR(opt,self.audio_processor) + self.asr = MuseASR(opt,self,self.audio_processor) self.asr.warm_up() - if opt.tts == "edgetts": - self.tts = EdgeTTS(opt,self) - elif opt.tts == "gpt-sovits": - self.tts = VoitsTTS(opt,self) - elif opt.tts == "xtts": - self.tts = XTTS(opt,self) #self.__warm_up() self.render_event = mp.Event() @@ -246,8 +242,16 @@ class MuseReal: res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1) except queue.Empty: continue - if audio_frames[0][1]==1 and audio_frames[1][1]==1: #全为静音数据,只需要取fullimg - combine_frame = self.frame_list_cycle[idx] + if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据,只需要取fullimg + audiotype = audio_frames[0][1] + if self.custom_index.get(audiotype) is not None: #有自定义视频 + mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype]) + combine_frame = self.custom_img_cycle[audiotype][mirindex] + self.custom_index[audiotype] += 1 + # if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]): + # self.curr_state = 1 #当前视频不循环播放,切换到静音状态 + else: + combine_frame = self.frame_list_cycle[idx] else: bbox = self.coord_list_cycle[idx] ori_frame = copy.deepcopy(self.frame_list_cycle[idx]) @@ -283,6 +287,7 @@ class MuseReal: # self.asr.warm_up() self.tts.render(quit_event) + self.init_customindex() process_thread = Thread(target=self.process_frames, args=(quit_event,loop,audio_track,video_track)) process_thread.start() diff --git a/asrreal.py b/nerfasr.py similarity index 94% rename from asrreal.py rename to nerfasr.py index 62aa15e..b74b199 100644 --- a/asrreal.py +++ b/nerfasr.py @@ -12,9 +12,9 @@ from threading import Thread, Event from baseasr import BaseASR -class ASR(BaseASR): - def __init__(self, opt): - super().__init__(opt) +class NerfASR(BaseASR): + def __init__(self, opt, parent): + super().__init__(opt,parent) self.device = 'cuda' if torch.cuda.is_available() else 'cpu' if 'esperanto' in self.opt.asr_model: @@ -66,8 +66,12 @@ class ASR(BaseASR): type = 0 #print(f'[INFO] get frame {frame.shape}') except queue.Empty: - frame = np.zeros(self.chunk, dtype=np.float32) - type = 1 + if self.parent and self.parent.curr_state>1: #播放自定义音频 + frame = self.parent.get_audio_stream(self.parent.curr_state) + type = self.parent.curr_state + else: + frame = np.zeros(self.chunk, dtype=np.float32) + type = 1 return frame,type diff --git a/nerfreal.py b/nerfreal.py index ef04c3e..a3cc372 100644 --- a/nerfreal.py +++ b/nerfreal.py @@ -9,15 +9,17 @@ import time import torch.nn.functional as F 
import cv2 -from asrreal import ASR +from nerfasr import NerfASR from ttsreal import EdgeTTS,VoitsTTS,XTTS import asyncio from av import AudioFrame, VideoFrame +from basereal import BaseReal -class NeRFReal: +class NeRFReal(BaseReal): def __init__(self, opt, trainer, data_loader, debug=True): - self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters. + super().__init__(opt) + #self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters. self.W = opt.W self.H = opt.H @@ -55,17 +57,11 @@ class NeRFReal: #self.ind_index = 0 #self.ind_num = trainer.model.individual_codes.shape[0] - self.customimg_index = 0 + #self.customimg_index = 0 # build asr - self.asr = ASR(opt) + self.asr = NerfASR(opt,self) self.asr.warm_up() - if opt.tts == "edgetts": - self.tts = EdgeTTS(opt,self) - elif opt.tts == "gpt-sovits": - self.tts = VoitsTTS(opt,self) - elif opt.tts == "xtts": - self.tts = XTTS(opt,self) ''' video_path = 'video_stream' @@ -124,14 +120,14 @@ class NeRFReal: self.asr.pause_talk() - def mirror_index(self, index): - size = self.opt.customvideo_imgnum - turn = index // size - res = index % size - if turn % 2 == 0: - return res - else: - return size - res - 1 + # def mirror_index(self, index): + # size = self.opt.customvideo_imgnum + # turn = index // size + # res = index % size + # if turn % 2 == 0: + # return res + # else: + # return size - res - 1 def test_step(self,loop=None,audio_track=None,video_track=None): @@ -148,39 +144,57 @@ class NeRFReal: # use the live audio stream data['auds'] = self.asr.get_next_feat() - audiotype = 0 - if self.opt.transport=='rtmp': - for _ in range(2): - frame,type = self.asr.get_audio_out() - audiotype += type - #print(f'[INFO] get_audio_out shape ',frame.shape) + audiotype1 = 0 + audiotype2 = 0 + #send audio + for i in range(2): + frame,type = self.asr.get_audio_out() + if i==0: + audiotype1 = type + else: + audiotype2 = type + #print(f'[INFO] get_audio_out shape ',frame.shape) + if self.opt.transport=='rtmp': self.streamer.stream_frame_audio(frame) - else: - for _ in range(2): - frame,type = self.asr.get_audio_out() - audiotype += type + else: #webrtc frame = (frame * 32767).astype(np.int16) new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) new_frame.planes[0].update(frame.tobytes()) new_frame.sample_rate=16000 - # if audio_track._queue.qsize()>10: - # time.sleep(0.1) - asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) + asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) + + # if self.opt.transport=='rtmp': + # for _ in range(2): + # frame,type = self.asr.get_audio_out() + # audiotype += type + # #print(f'[INFO] get_audio_out shape ',frame.shape) + # self.streamer.stream_frame_audio(frame) + # else: #webrtc + # for _ in range(2): + # frame,type = self.asr.get_audio_out() + # audiotype += type + # frame = (frame * 32767).astype(np.int16) + # new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) + # new_frame.planes[0].update(frame.tobytes()) + # new_frame.sample_rate=16000 + # # if audio_track._queue.qsize()>10: + # # time.sleep(0.1) + # asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) #t = time.time() - if self.opt.customvideo and audiotype!=0: - self.loader = iter(self.data_loader) #init - imgindex = self.mirror_index(self.customimg_index) + if audiotype1!=0 and audiotype2!=0 and self.custom_index.get(audiotype1) is not None: #不为推理视频并且有自定义视频 + mirindex 
= self.mirror_index(len(self.custom_img_cycle[audiotype1]),self.custom_index[audiotype1]) + #imgindex = self.mirror_index(self.customimg_index) #print('custom img index:',imgindex) - image = cv2.imread(os.path.join(self.opt.customvideo_img, str(int(imgindex))+'.png')) + #image = cv2.imread(os.path.join(self.opt.customvideo_img, str(int(imgindex))+'.png')) + image = self.custom_img_cycle[audiotype1][mirindex] image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + self.custom_index[audiotype1] += 1 if self.opt.transport=='rtmp': self.streamer.stream_frame(image) else: new_frame = VideoFrame.from_ndarray(image, format="rgb24") asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) - self.customimg_index += 1 - else: - self.customimg_index = 0 + else: #推理视频+贴回 outputs = self.trainer.test_gui_with_data(data, self.W, self.H) #print('-------ernerf time: ',time.time()-t) #print(f'[INFO] outputs shape ',outputs['image'].shape) @@ -213,6 +227,8 @@ class NeRFReal: #if self.opt.asr: # self.asr.warm_up() + self.init_customindex() + if self.opt.transport=='rtmp': from rtmp_streaming import StreamerConfig, Streamer fps=25 diff --git a/web/webrtcapi-custom.html b/web/webrtcapi-custom.html index eaef394..87d9ab4 100644 --- a/web/webrtcapi-custom.html +++ b/web/webrtcapi-custom.html @@ -54,7 +54,20 @@
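The renderer changes above (`musereal.py`, `nerfreal.py`) route silent periods to the custom frame lists and step through them with `BaseReal.mirror_index`. For readers who want that indexing in isolation, here is a small self-contained sketch of the ping-pong traversal, following the logic visible in the diff (the commented-out copy in `nerfreal.py` shows the full body):

```python
# Standalone sketch of the frame indexing used when looping a short custom or
# idle clip: frames are walked forward, then backward, so playback never jumps
# from the last frame straight back to the first.
def mirror_index(size: int, index: int) -> int:
    turn = index // size      # how many full passes over the clip so far
    res = index % size        # position inside the current pass
    if turn % 2 == 0:
        return res            # even pass: play frames forward
    return size - res - 1     # odd pass: play frames in reverse

if __name__ == "__main__":
    # With 4 frames the playback order is 0 1 2 3 3 2 1 0 0 1 2 3 ...
    print([mirror_index(4, i) for i in range(12)])
```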