diff --git a/app.py b/app.py
index 4f3c255..9292cd0 100644
--- a/app.py
+++ b/app.py
@@ -140,7 +140,7 @@ async def human(request):
     if params['type']=='echo':
         nerfreals[sessionid].put_msg_txt(params['text'])
     elif params['type']=='chat':
-        res=await asyncio.get_event_loop().run_in_executor(None, llm_response(params['text']))
+        res=await asyncio.get_event_loop().run_in_executor(None, llm_response(params['text']))
         nerfreals[sessionid].put_msg_txt(res)
 
     return web.Response(
@@ -150,6 +150,19 @@ async def human(request):
         ),
     )
 
+async def set_audiotype(request):
+    params = await request.json()
+
+    sessionid = params.get('sessionid',0)
+    nerfreals[sessionid].set_curr_state(params['audiotype'],params['reinit'])
+
+    return web.Response(
+        content_type="application/json",
+        text=json.dumps(
+            {"code": 0, "data":"ok"}
+        ),
+    )
+
 async def on_shutdown(app):
     # close peer connections
     coros = [pc.close() for pc in pcs]
@@ -307,6 +320,8 @@ if __name__ == '__main__':
     parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img')
     parser.add_argument('--customvideo_imgnum', type=int, default=1)
 
+    parser.add_argument('--customvideo_config', type=str, default='')
+
     parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
     parser.add_argument('--REF_FILE', type=str, default=None)
     parser.add_argument('--REF_TEXT', type=str, default=None)
@@ -325,6 +340,10 @@ if __name__ == '__main__':
     opt = parser.parse_args()
     #app.config.from_object(opt)
     #print(app.config)
+    opt.customopt = []
+    if opt.customvideo_config!='':
+        with open(opt.customvideo_config,'r') as file:
+            opt.customopt = json.load(file)
 
     if opt.model == 'ernerf':
         from ernerf.nerf_triplane.provider import NeRFDataset_Test
@@ -402,6 +421,7 @@ if __name__ == '__main__':
     appasync.on_shutdown.append(on_shutdown)
     appasync.router.add_post("/offer", offer)
     appasync.router.add_post("/human", human)
+    appasync.router.add_post("/set_audiotype", set_audiotype)
     appasync.router.add_static('/',path='web')
 
     # Configure default CORS settings.
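The new `/set_audiotype` route is what switches a session into (and out of) the custom-video states introduced in `basereal.py` below. A minimal client sketch, not part of the diff; it assumes the aiohttp server is reachable at `localhost:8010`, so substitute whatever host and port you actually launch `app.py` with:

```python
# Hypothetical client for the new endpoint; the base URL is an assumption.
import requests

BASE = "http://localhost:8010"

# Switch session 0 to the custom clip registered as audiotype 2 and restart
# its frame/audio counters (reinit=True maps to set_curr_state's reinit flag).
requests.post(f"{BASE}/set_audiotype",
              json={"sessionid": 0, "audiotype": 2, "reinit": True})

# audiotype 1 is the plain silent/idle state, so this switches back off the clip.
requests.post(f"{BASE}/set_audiotype",
              json={"sessionid": 0, "audiotype": 1, "reinit": False})
```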
diff --git a/baseasr.py b/baseasr.py
index df66873..d58170c 100644
--- a/baseasr.py
+++ b/baseasr.py
@@ -7,8 +7,9 @@ import multiprocessing as mp
 
 class BaseASR:
-    def __init__(self, opt):
+    def __init__(self, opt, parent=None):
         self.opt = opt
+        self.parent = parent
 
         self.fps = opt.fps # 20 ms per frame
         self.sample_rate = 16000
@@ -38,8 +39,12 @@ class BaseASR:
             type = 0
             #print(f'[INFO] get frame {frame.shape}')
         except queue.Empty:
-            frame = np.zeros(self.chunk, dtype=np.float32)
-            type = 1
+            if self.parent and self.parent.curr_state>1: # play the custom audio for this state
+                frame = self.parent.get_audio_stream(self.parent.curr_state)
+                type = self.parent.curr_state
+            else:
+                frame = np.zeros(self.chunk, dtype=np.float32)
+                type = 1
 
         return frame,type
diff --git a/basereal.py b/basereal.py
new file mode 100644
index 0000000..4a7f5de
--- /dev/null
+++ b/basereal.py
@@ -0,0 +1,81 @@
+import math
+import torch
+import numpy as np
+
+import os
+import time
+import cv2
+import glob
+import pickle
+import copy
+
+import queue
+from queue import Queue
+from threading import Thread, Event
+from io import BytesIO
+import soundfile as sf
+
+from tqdm import tqdm
+
+def read_imgs(img_list):
+    frames = []
+    print('reading images...')
+    for img_path in tqdm(img_list):
+        frame = cv2.imread(img_path)
+        frames.append(frame)
+    return frames
+
+class BaseReal:
+    def __init__(self, opt):
+        self.opt = opt
+        self.sample_rate = 16000
+        self.chunk = self.sample_rate // opt.fps # 320 samples per chunk (20ms * 16000 / 1000)
+
+        self.curr_state=0
+        self.custom_img_cycle = {}
+        self.custom_audio_cycle = {}
+        self.custom_audio_index = {}
+        self.custom_index = {}
+        self.custom_opt = {}
+        self.__loadcustom()
+
+    def __loadcustom(self):
+        for item in self.opt.customopt:
+            print(item)
+            input_img_list = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
+            input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
+            self.custom_img_cycle[item['audiotype']] = read_imgs(input_img_list)
+            self.custom_audio_cycle[item['audiotype']], sample_rate = sf.read(item['audiopath'], dtype='float32')
+            self.custom_audio_index[item['audiotype']] = 0
+            self.custom_index[item['audiotype']] = 0
+            self.custom_opt[item['audiotype']] = item
+
+    def mirror_index(self,size, index):
+        #size = len(self.coord_list_cycle)
+        turn = index // size
+        res = index % size
+        if turn % 2 == 0:
+            return res
+        else:
+            return size - res - 1
+
+    def get_audio_stream(self,audiotype):
+        idx = self.custom_audio_index[audiotype]
+        stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
+        self.custom_audio_index[audiotype] += self.chunk
+        if self.custom_audio_index[audiotype]>=self.custom_audio_cycle[audiotype].shape[0]:
+            self.curr_state = 1 # the custom audio does not loop; switch to the silent state
+        return stream
+
+    def set_curr_state(self,audiotype, reinit):
+        self.curr_state = audiotype
+        if reinit:
+            self.custom_audio_index[audiotype] = 0
+            self.custom_index[audiotype] = 0
+
+    # def process_custom(self,audiotype:int,idx:int):
+    #     if self.curr_state!=audiotype: # switching from inference to the scripted clip
+    #         if idx in self.switch_pos:  # only switch at a marked cut point
+    #             self.curr_state=audiotype
+    #             self.custom_index=0
+    #     else:
+    #         self.custom_index+=1
\ No newline at end of file
diff --git a/data/custom_config.json b/data/custom_config.json
new file mode 100644
index 0000000..9fc54a7
--- /dev/null
+++ b/data/custom_config.json
@@ -0,0 +1,7 @@
+[
+    {
+        "audiotype":2,
+        "imgpath":"data/customvideo/image",
+        "audiopath":"data/customvideo/audio.wav"
+    }
+]
\ No newline at end of file
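`BaseReal.mirror_index` is what keeps a short custom clip from visibly jumping when it repeats: the index walks forward through the frame list, then backward, then forward again. A standalone illustration of that ping-pong order (not part of the diff):

```python
# Copy of the indexing logic from BaseReal.mirror_index, for illustration only.
def mirror_index(size, index):
    turn = index // size
    res = index % size
    return res if turn % 2 == 0 else size - res - 1

print([mirror_index(5, i) for i in range(12)])
# -> [0, 1, 2, 3, 4, 4, 3, 2, 1, 0, 0, 1]
```

Note that `__loadcustom` sorts the clip's frames by the integer value of their file names, so the images under `imgpath` are expected to be named `0.jpg`, `1.png`, and so on.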
diff --git a/lipreal.py b/lipreal.py
index 9461e7b..460c43f 100644
--- a/lipreal.py
+++ b/lipreal.py
@@ -23,8 +23,8 @@ from ttsreal import EdgeTTS,VoitsTTS,XTTS
 from lipasr import LipASR
 import asyncio
 from av import AudioFrame, VideoFrame
-
 from wav2lip.models import Wav2Lip
+from basereal import BaseReal
 
 from tqdm import tqdm
@@ -143,9 +143,10 @@ def inference(render_event,batch_size,face_imgs_path,audio_feat_queue,audio_out_
     print('musereal inference processor stop')
 
 @torch.no_grad()
-class LipReal:
+class LipReal(BaseReal):
     def __init__(self, opt):
-        self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
+        super().__init__(opt)
+        #self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
 
         self.W = opt.W
         self.H = opt.H
@@ -163,7 +164,7 @@ class LipReal:
         #self.__loadmodels()
         self.__loadavatar()
 
-        self.asr = LipASR(opt)
+        self.asr = LipASR(opt,self)
         self.asr.warm_up()
         if opt.tts == "edgetts":
             self.tts = EdgeTTS(opt,self)
@@ -213,8 +214,16 @@ class LipReal:
                 res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
-            if audio_frames[0][1]==1 and audio_frames[1][1]==1: # all silence data, just take the full image
-                combine_frame = self.frame_list_cycle[idx]
+            if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: # no speech audio, just take the full image
+                audiotype = audio_frames[0][1]
+                if self.custom_index.get(audiotype) is not None: # a custom video is configured for this state
+                    mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
+                    combine_frame = self.custom_img_cycle[audiotype][mirindex]
+                    self.custom_index[audiotype] += 1
+                    # if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]):
+                    #     self.curr_state = 1 # the custom video does not loop; switch to the silent state
+                else:
+                    combine_frame = self.frame_list_cycle[idx]
             else:
                 bbox = self.coord_list_cycle[idx]
                 combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
diff --git a/llm/VllmGPT.py b/llm/VllmGPT.py
index a8db6aa..b5ae5ec 100644
--- a/llm/VllmGPT.py
+++ b/llm/VllmGPT.py
@@ -15,7 +15,7 @@ class VllmGPT:
         self.__URL = "http://{}:{}/v1/completions".format(self.host, self.port)
         self.__URL2 = "http://{}:{}/v1/chat/completions".format(self.host, self.port)
 
-    def question(self,cont):
+    def chat(self,cont):
         chat_list = []
         # contentdb = content_db.new_instance()
         # list = contentdb.get_list('all','desc',11)
@@ -77,5 +77,5 @@ class VllmGPT:
 
 if __name__ == "__main__":
     vllm = VllmGPT('192.168.1.3','8101')
-    req = vllm.question("你叫什么名字啊今年多大了")
+    req = vllm.chat("你叫什么名字啊今年多大了")
     print(req)
diff --git a/web/webrtcapi-custom.html b/web/webrtcapi-custom.html
new file mode 100644
index 0000000..eaef394
--- /dev/null
+++ b/web/webrtcapi-custom.html
@@ -0,0 +1,113 @@
[new 113-line "WebRTC webcam" demo page; its markup did not survive extraction. The surviving fragments indicate a text-input form, start/stop controls, and a "Media" section with the playback element.]
diff --git a/web/webrtcapi.html b/web/webrtcapi.html
index 7f874a9..af269d4 100644
--- a/web/webrtcapi.html
+++ b/web/webrtcapi.html
@@ -30,7 +30,7 @@
[one-line markup change near the "input text" form; the removed and added HTML did not survive extraction]
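Putting the pieces together: the custom states only behave well when the assets referenced by `data/custom_config.json` match what `BaseReal.__loadcustom` and `BaseASR` expect, namely integer-named frame images and audio that is already 16 kHz mono (`sf.read` does not resample, and `BaseASR` streams fixed-size chunks at its hard-coded 16 kHz sample rate). A hedged sanity-check script, not part of the diff, with paths following the sample config:

```python
# Check custom-video assets against the assumptions made in basereal.py/baseasr.py.
import glob
import json
import os

import soundfile as sf

with open('data/custom_config.json') as f:
    items = json.load(f)

for item in items:
    imgs = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
    # __loadcustom sorts by int(filename), so frames should be named 0.jpg, 1.jpg, ...
    numeric_names = all(os.path.splitext(os.path.basename(p))[0].isdigit() for p in imgs)
    audio, sr = sf.read(item['audiopath'], dtype='float32')
    print(f"audiotype {item['audiotype']}: {len(imgs)} frames, "
          f"numeric names: {numeric_names}, audio: {sr} Hz, mono: {audio.ndim == 1}")
    if sr != 16000:
        print("  warning: BaseASR serves audio at 16 kHz; resample this file first")
```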