feat: switch avatar handling to automatically resolved absolute paths and add an avatar-generation API
parent 18d7db35a7
commit cd7d5f31b5

README.md (15 changed lines)
@@ -6,11 +6,10 @@ Real time interactive streaming digital human, realize audio video synchronous
## Features
1. Supports multiple digital-human models: ernerf, musetalk, wav2lip
2. Supports voice cloning
3. Supports multiple audio feature extractors: wav2vec, hubert
3. Supports interrupting the digital human while it is speaking
4. Supports full-body video stitching
5. Supports rtmp and webrtc
6. Supports video orchestration: plays a custom video when the avatar is not speaking
7. Supports dialogue with large language models

## 1. Installation
@@ -171,13 +170,11 @@ cd MuseTalk
Edit configs/inference/realtime.yaml and set preparation to True
python -m scripts.realtime_inference --inference_config configs/inference/realtime.yaml
When it finishes, copy the files under results/avatars into this project's data/avatars directory
```

```bash
You can also use simple_musetalk.py in the local directory
Method 2
Run
cd musetalk
python simple_musetalk.py --avatar_id 2 --file D:\\ok\\test.mp4
The avatar is written directly to data/avatars
python simple_musetalk.py --avatar_id 4 --file D:\\ok\\test.mp4
Both video and image input are supported; results are generated under the data/avatars directory automatically
```

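This commit also exposes avatar generation over HTTP: app.py (later in this diff) registers a `/create_musetalk` endpoint that accepts an uploaded video or image plus an avatar id and runs `create_musetalk_human` on the server. A minimal client sketch, assuming the aiohttp server is running on its default port 8010 and `serverip` is your host; the part order (file first, then avatar_id) matches the handler shown below:

```python
# Hedged example: upload a video to the new /create_musetalk endpoint.
# "serverip", the port (default --listenport 8010) and the local file name are assumptions.
import requests

with open("test.mp4", "rb") as f:
    resp = requests.post(
        "http://serverip:8010/create_musetalk",
        files=[
            ("file", ("test.mp4", f, "video/mp4")),  # the handler reads the file part first
            ("avatar_id", (None, "3")),              # then the integer avatar id
        ],
    )
print(resp.json())  # e.g. {'status': 'success', 'filename': 'test.mp4', 'int_value': 3}
```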
### 3.10 Using the wav2lip model
@@ -185,7 +182,7 @@ python simple_musetalk.py --avatar_id 2 --file D:\\ok\\test.mp4
- Download the models
Download the models wav2lip needs from the network drive: https://drive.uc.cn/s/551be97d7cfa4
Copy s3fd.pth to wav2lip/face_detection/detection/sfd/s3fd.pth in this project, and copy wav2lip.pth into this project's models directory
Digital-human avatar file wav2lip_avatar1.tar.gz; after extracting, copy the whole folder into this project's data/avatars directory
Digital-human avatar file wav2lip_avatar1.tar.gz, network drive: https://drive.uc.cn/s/5bd0cde0b0774; after extracting, copy the whole folder into this project's data/avatars directory
- Run
python app.py --transport webrtc --model wav2lip --avatar_id wav2lip_avatar1
Open http://serverip:8010/webrtcapi.html in a browser

app.py (135 changed lines)

@@ -1,22 +1,30 @@
# server.py
import argparse
import asyncio
import json
import multiprocessing
from threading import Thread, Event

import aiohttp
import aiohttp_cors
from aiohttp import web
from aiortc import RTCPeerConnection, RTCSessionDescription
from flask import Flask
from flask import Flask, render_template,send_from_directory,request, jsonify
from flask_sockets import Sockets
import base64
import time
import json
import gevent
from gevent import pywsgi
from geventwebsocket.handler import WebSocketHandler
import os
import re
import numpy as np
from threading import Thread,Event
import multiprocessing

from musetalk.simple_musetalk import create_musetalk_human
from aiohttp import web
import aiohttp
import aiohttp_cors
from aiortc import RTCPeerConnection, RTCSessionDescription
from webrtc import HumanPlayer

import argparse

import shutil
import asyncio


app = Flask(__name__)
sockets = Sockets(app)
global nerfreal

@@ -51,7 +59,6 @@ def llm_response(message):
    print(response)
    return response


@sockets.route('/humanchat')
def chat_socket(ws):
    # get the WebSocket object

@@ -72,11 +79,9 @@ def chat_socket(ws):
        res=llm_response(message)
        nerfreal.put_msg_txt(res)


#####webrtc###############################
pcs = set()


#@app.route('/offer', methods=['POST'])
async def offer(request):
    params = await request.json()

@@ -110,10 +115,12 @@ async def offer(request):
        ),
    )


async def human(request):
    params = await request.json()

    if params.get('interrupt'):
        nerfreal.pause_talk()

    if params['type']=='echo':
        nerfreal.put_msg_txt(params['text'])
    elif params['type']=='chat':

@@ -127,35 +134,12 @@ async def human(request):
        ),
    )


async def handle_create_musetalk(request):
    reader = await request.multipart()
    # file part
    file_part = await reader.next()
    filename = file_part.filename
    file_data = await file_part.read()  # read the uploaded file content
    # note: make sure this path is writable
    with open(filename, 'wb') as f:
        f.write(file_data)
    # integer part (the avatar id)
    part = await reader.next()
    avatar_id = int(await part.text())
    create_musetalk_human(filename, avatar_id)
    os.remove(filename)
    return web.json_response({
        'status': 'success',
        'filename': filename,
        'int_value': avatar_id,
    })
|
||||
|
||||
|
||||
async def on_shutdown(app):
|
||||
# close peer connections
|
||||
coros = [pc.close() for pc in pcs]
|
||||
await asyncio.gather(*coros)
|
||||
pcs.clear()
|
||||
|
||||
|
||||
async def post(url,data):
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
|
@ -164,7 +148,6 @@ async def post(url, data):
|
|||
except aiohttp.ClientError as e:
|
||||
print(f'Error: {e}')
|
||||
|
||||
|
||||
async def run(push_url):
|
||||
pc = RTCPeerConnection()
|
||||
pcs.add(pc)
|
||||
|
@ -183,8 +166,6 @@ async def run(push_url):
|
|||
await pc.setLocalDescription(await pc.createOffer())
|
||||
answer = await post(push_url,pc.localDescription.sdp)
|
||||
await pc.setRemoteDescription(RTCSessionDescription(sdp=answer,type='answer'))
|
||||
|
||||
|
||||
##########################################
|
||||
# os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
|
||||
# os.environ['MULTIPROCESSING_METHOD'] = 'forkserver'
|
||||
|
@ -204,19 +185,13 @@ if __name__ == '__main__':
|
|||
### training options
|
||||
parser.add_argument('--ckpt', type=str, default='data/pretrained/ngp_kf.pth')
|
||||
|
||||
parser.add_argument('--num_rays', type=int, default=4096 * 16,
|
||||
help="num rays sampled per image for each training step")
|
||||
parser.add_argument('--num_rays', type=int, default=4096 * 16, help="num rays sampled per image for each training step")
|
||||
parser.add_argument('--cuda_ray', action='store_true', help="use CUDA raymarching instead of pytorch")
|
||||
parser.add_argument('--max_steps', type=int, default=16,
|
||||
help="max num steps sampled per ray (only valid when using --cuda_ray)")
|
||||
parser.add_argument('--num_steps', type=int, default=16,
|
||||
help="num steps sampled per ray (only valid when NOT using --cuda_ray)")
|
||||
parser.add_argument('--upsample_steps', type=int, default=0,
|
||||
help="num steps up-sampled per ray (only valid when NOT using --cuda_ray)")
|
||||
parser.add_argument('--update_extra_interval', type=int, default=16,
|
||||
help="iter interval to update extra status (only valid when using --cuda_ray)")
|
||||
parser.add_argument('--max_ray_batch', type=int, default=4096,
|
||||
help="batch size of rays at inference to avoid OOM (only valid when NOT using --cuda_ray)")
|
||||
parser.add_argument('--max_steps', type=int, default=16, help="max num steps sampled per ray (only valid when using --cuda_ray)")
|
||||
parser.add_argument('--num_steps', type=int, default=16, help="num steps sampled per ray (only valid when NOT using --cuda_ray)")
|
||||
parser.add_argument('--upsample_steps', type=int, default=0, help="num steps up-sampled per ray (only valid when NOT using --cuda_ray)")
|
||||
parser.add_argument('--update_extra_interval', type=int, default=16, help="iter interval to update extra status (only valid when using --cuda_ray)")
|
||||
parser.add_argument('--max_ray_batch', type=int, default=4096, help="batch size of rays at inference to avoid OOM (only valid when NOT using --cuda_ray)")
|
||||
|
||||
### loss set
|
||||
parser.add_argument('--warmup_step', type=int, default=10000, help="warm up steps")
|
||||
|
@ -231,31 +206,23 @@ if __name__ == '__main__':
|
|||
parser.add_argument('--bg_img', type=str, default='white', help="background image")
|
||||
parser.add_argument('--fbg', action='store_true', help="frame-wise bg")
|
||||
parser.add_argument('--exp_eye', action='store_true', help="explicitly control the eyes")
|
||||
parser.add_argument('--fix_eye', type=float, default=-1,
|
||||
help="fixed eye area, negative to disable, set to 0-0.3 for a reasonable eye")
|
||||
parser.add_argument('--fix_eye', type=float, default=-1, help="fixed eye area, negative to disable, set to 0-0.3 for a reasonable eye")
|
||||
parser.add_argument('--smooth_eye', action='store_true', help="smooth the eye area sequence")
|
||||
|
||||
parser.add_argument('--torso_shrink', type=float, default=0.8,
|
||||
help="shrink bg coords to allow more flexibility in deform")
|
||||
parser.add_argument('--torso_shrink', type=float, default=0.8, help="shrink bg coords to allow more flexibility in deform")
|
||||
|
||||
### dataset options
|
||||
parser.add_argument('--color_space', type=str, default='srgb', help="Color space, supports (linear, srgb)")
|
||||
parser.add_argument('--preload', type=int, default=0,
|
||||
help="0 means load data from disk on-the-fly, 1 means preload to CPU, 2 means GPU.")
|
||||
parser.add_argument('--preload', type=int, default=0, help="0 means load data from disk on-the-fly, 1 means preload to CPU, 2 means GPU.")
|
||||
# (the default value is for the fox dataset)
|
||||
parser.add_argument('--bound', type=float, default=1,
|
||||
help="assume the scene is bounded in box[-bound, bound]^3, if > 1, will invoke adaptive ray marching.")
|
||||
parser.add_argument('--bound', type=float, default=1, help="assume the scene is bounded in box[-bound, bound]^3, if > 1, will invoke adaptive ray marching.")
|
||||
parser.add_argument('--scale', type=float, default=4, help="scale camera location into box[-bound, bound]^3")
|
||||
parser.add_argument('--offset', type=float, nargs='*', default=[0, 0, 0], help="offset of camera location")
|
||||
parser.add_argument('--dt_gamma', type=float, default=1 / 256,
|
||||
help="dt_gamma (>=0) for adaptive ray marching. set to 0 to disable, >0 to accelerate rendering (but usually with worse quality)")
|
||||
parser.add_argument('--dt_gamma', type=float, default=1/256, help="dt_gamma (>=0) for adaptive ray marching. set to 0 to disable, >0 to accelerate rendering (but usually with worse quality)")
|
||||
parser.add_argument('--min_near', type=float, default=0.05, help="minimum near distance for camera")
|
||||
parser.add_argument('--density_thresh', type=float, default=10,
|
||||
help="threshold for density grid to be occupied (sigma)")
|
||||
parser.add_argument('--density_thresh_torso', type=float, default=0.01,
|
||||
help="threshold for density grid to be occupied (alpha)")
|
||||
parser.add_argument('--patch_size', type=int, default=1,
|
||||
help="[experimental] render patches in training, so as to apply LPIPS loss. 1 means disabled, use [64, 32, 16] to enable")
|
||||
parser.add_argument('--density_thresh', type=float, default=10, help="threshold for density grid to be occupied (sigma)")
|
||||
parser.add_argument('--density_thresh_torso', type=float, default=0.01, help="threshold for density grid to be occupied (alpha)")
|
||||
parser.add_argument('--patch_size', type=int, default=1, help="[experimental] render patches in training, so as to apply LPIPS loss. 1 means disabled, use [64, 32, 16] to enable")
|
||||
|
||||
parser.add_argument('--init_lips', action='store_true', help="init lips region")
|
||||
parser.add_argument('--finetune_lips', action='store_true', help="use LPIPS and landmarks to fine tune lips region")
|
||||
|
@ -273,15 +240,12 @@ if __name__ == '__main__':
|
|||
parser.add_argument('--max_spp', type=int, default=1, help="GUI rendering max sample per pixel")
|
||||
|
||||
### else
|
||||
parser.add_argument('--att', type=int, default=2,
|
||||
help="audio attention mode (0 = turn off, 1 = left-direction, 2 = bi-direction)")
|
||||
parser.add_argument('--aud', type=str, default='',
|
||||
help="audio source (empty will load the default, else should be a path to a npy file)")
|
||||
parser.add_argument('--att', type=int, default=2, help="audio attention mode (0 = turn off, 1 = left-direction, 2 = bi-direction)")
|
||||
parser.add_argument('--aud', type=str, default='', help="audio source (empty will load the default, else should be a path to a npy file)")
|
||||
parser.add_argument('--emb', action='store_true', help="use audio class + embedding instead of logits")
|
||||
|
||||
parser.add_argument('--ind_dim', type=int, default=4, help="individual code dim, 0 to turn off")
|
||||
parser.add_argument('--ind_num', type=int, default=10000,
|
||||
help="number of individual codes, should be larger than training dataset size")
|
||||
parser.add_argument('--ind_num', type=int, default=10000, help="number of individual codes, should be larger than training dataset size")
|
||||
|
||||
parser.add_argument('--ind_dim_torso', type=int, default=8, help="individual code dim, 0 to turn off")
|
||||
|
||||
|
@ -290,8 +254,7 @@ if __name__ == '__main__':
|
|||
parser.add_argument('--part2', action='store_true', help="use partial training data (first 15s)")
|
||||
|
||||
parser.add_argument('--train_camera', action='store_true', help="optimize camera pose")
|
||||
parser.add_argument('--smooth_path', action='store_true',
|
||||
help="brute-force smooth camera pose trajectory with a window size")
|
||||
parser.add_argument('--smooth_path', action='store_true', help="brute-force smooth camera pose trajectory with a window size")
|
||||
parser.add_argument('--smooth_path_window', type=int, default=7, help="smoothing window size")
|
||||
|
||||
# asr
|
||||
|
@ -325,7 +288,6 @@ if __name__ == '__main__':
|
|||
parser.add_argument('--batch_size', type=int, default=16)
|
||||
|
||||
parser.add_argument('--customvideo', action='store_true', help="custom video")
|
||||
parser.add_argument('--static_img', action='store_true', help="Use the first photo as a time of rest")
|
||||
parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img')
|
||||
parser.add_argument('--customvideo_imgnum', type=int, default=1)
|
||||
|
||||
|
@ -339,8 +301,7 @@ if __name__ == '__main__':
|
|||
parser.add_argument('--model', type=str, default='ernerf') #musetalk wav2lip
|
||||
|
||||
parser.add_argument('--transport', type=str, default='rtcpush') #rtmp webrtc rtcpush
|
||||
parser.add_argument('--push_url', type=str,
|
||||
default='http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream') # rtmp://localhost/live/livestream
|
||||
parser.add_argument('--push_url', type=str, default='http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream') #rtmp://localhost/live/livestream
|
||||
|
||||
parser.add_argument('--listenport', type=int, default=8010)
|
||||
|
||||
|
@ -353,7 +314,6 @@ if __name__ == '__main__':
|
|||
from ernerf.nerf_triplane.utils import *
|
||||
from ernerf.nerf_triplane.network import NeRFNetwork
|
||||
from nerfreal import NeRFReal
|
||||
|
||||
# assert test mode
|
||||
opt.test = True
|
||||
opt.test_train = False
|
||||
|
@ -388,8 +348,7 @@ if __name__ == '__main__':
|
|||
criterion = torch.nn.MSELoss(reduction='none')
|
||||
metrics = [] # use no metric in GUI for faster initialization...
|
||||
print(model)
|
||||
trainer = Trainer('ngp', opt, model, device=device, workspace=opt.workspace, criterion=criterion, fp16=opt.fp16,
|
||||
metrics=metrics, use_checkpoint=opt.ckpt)
|
||||
trainer = Trainer('ngp', opt, model, device=device, workspace=opt.workspace, criterion=criterion, fp16=opt.fp16, metrics=metrics, use_checkpoint=opt.ckpt)
|
||||
|
||||
test_loader = NeRFDataset_Test(opt, device=device).dataloader()
|
||||
model.aud_features = test_loader._data.auds
|
||||
|
@ -399,12 +358,10 @@ if __name__ == '__main__':
|
|||
nerfreal = NeRFReal(opt, trainer, test_loader)
|
||||
elif opt.model == 'musetalk':
|
||||
from musereal import MuseReal
|
||||
|
||||
print(opt)
|
||||
nerfreal = MuseReal(opt)
|
||||
elif opt.model == 'wav2lip':
|
||||
from lipreal import LipReal
|
||||
|
||||
print(opt)
|
||||
nerfreal = LipReal(opt)
|
||||
|
||||
|
@ -419,7 +376,6 @@ if __name__ == '__main__':
|
|||
appasync.on_shutdown.append(on_shutdown)
|
||||
appasync.router.add_post("/offer", offer)
|
||||
appasync.router.add_post("/human", human)
|
||||
appasync.router.add_post("/create_musetalk", handle_create_musetalk)
|
||||
appasync.router.add_static('/',path='web')
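With the `/create_musetalk` route registered above, the aiohttp app now exposes three POST endpoints plus the static web/ pages. A summary sketch; the paths come from the hunk above, while the payload shapes are inferred from the handlers and the bundled HTML pages, and the port is the default `--listenport` 8010:

```python
# Assumed HTTP surface after this commit (shapes inferred, not authoritative).
ROUTES = {
    "POST /offer":           "WebRTC signalling: JSON {sdp, type} in, JSON {sdp, type} out",
    "POST /human":           "speak text: JSON {text, type: 'echo' | 'chat', interrupt: bool}",
    "POST /create_musetalk": "multipart upload: file part first, then an integer avatar_id part",
    "GET  /...":             "static files served from the local web/ directory",
}
```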
|
||||
|
||||
# Configure default CORS settings.
|
||||
|
@ -434,7 +390,6 @@ if __name__ == '__main__':
|
|||
for route in list(appasync.router.routes()):
|
||||
cors.add(route)
|
||||
|
||||
|
||||
def run_server(runner):
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
@ -444,8 +399,6 @@ if __name__ == '__main__':
|
|||
if opt.transport=='rtcpush':
|
||||
loop.run_until_complete(run(opt.push_url))
|
||||
loop.run_forever()
|
||||
|
||||
|
||||
Thread(target=run_server, args=(web.AppRunner(appasync),)).start()
|
||||
|
||||
print('start websocket server')
|
||||
|
@ -453,3 +406,5 @@ if __name__ == '__main__':
|
|||
#app.router.add_post("/offer", offer)
|
||||
server = pywsgi.WSGIServer(('0.0.0.0', 8000), app, handler_class=WebSocketHandler)
|
||||
server.serve_forever()
|
||||
|
||||
|
asrreal.py (129 changed lines)
|
@ -4,29 +4,19 @@ import torch
|
|||
import torch.nn.functional as F
|
||||
from transformers import AutoModelForCTC, AutoProcessor, Wav2Vec2Processor, HubertModel
|
||||
|
||||
#import pyaudio
|
||||
import soundfile as sf
|
||||
import resampy
|
||||
|
||||
import queue
|
||||
from queue import Queue
|
||||
#from collections import deque
|
||||
from threading import Thread, Event
|
||||
from io import BytesIO
|
||||
|
||||
class ASR:
|
||||
from baseasr import BaseASR
|
||||
|
||||
class ASR(BaseASR):
|
||||
def __init__(self, opt):
|
||||
|
||||
self.opt = opt
|
||||
|
||||
self.play = opt.asr_play #false
|
||||
super().__init__(opt)
|
||||
|
||||
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
self.fps = opt.fps # 20 ms per frame
|
||||
self.sample_rate = 16000
|
||||
self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000)
|
||||
self.mode = 'live' if opt.asr_wav == '' else 'file'
|
||||
|
||||
if 'esperanto' in self.opt.asr_model:
|
||||
self.audio_dim = 44
|
||||
elif 'deepspeech' in self.opt.asr_model:
|
||||
|
@ -41,30 +31,11 @@ class ASR:
|
|||
self.context_size = opt.m
|
||||
self.stride_left_size = opt.l
|
||||
self.stride_right_size = opt.r
|
||||
self.text = '[START]\n'
|
||||
self.terminated = False
|
||||
self.frames = []
|
||||
self.inwarm = False
|
||||
|
||||
# pad left frames
|
||||
if self.stride_left_size > 0:
|
||||
self.frames.extend([np.zeros(self.chunk, dtype=np.float32)] * self.stride_left_size)
|
||||
|
||||
|
||||
self.exit_event = Event()
|
||||
#self.audio_instance = pyaudio.PyAudio() #not need
|
||||
|
||||
# create input stream
|
||||
self.queue = Queue()
|
||||
self.output_queue = Queue()
|
||||
# start a background process to read frames
|
||||
#self.input_stream = self.audio_instance.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, output=False, frames_per_buffer=self.chunk)
|
||||
#self.queue = Queue()
|
||||
#self.process_read_frame = Thread(target=_read_frame, args=(self.input_stream, self.exit_event, self.queue, self.chunk))
|
||||
|
||||
# current location of audio
|
||||
self.idx = 0
|
||||
|
||||
# create wav2vec model
|
||||
print(f'[INFO] loading ASR model {self.opt.asr_model}...')
|
||||
if 'hubert' in self.opt.asr_model:
|
||||
|
@ -74,10 +45,6 @@ class ASR:
|
|||
self.processor = AutoProcessor.from_pretrained(opt.asr_model)
|
||||
self.model = AutoModelForCTC.from_pretrained(opt.asr_model).to(self.device)
|
||||
|
||||
# prepare to save logits
|
||||
if self.opt.asr_save_feats:
|
||||
self.all_feats = []
|
||||
|
||||
# the extracted features
|
||||
# use a loop queue to efficiently record endless features: [f--t---][-------][-------]
|
||||
self.feat_buffer_size = 4
|
||||
|
@ -93,8 +60,16 @@ class ASR:
|
|||
# warm up steps needed: mid + right + window_size + attention_size
|
||||
self.warm_up_steps = self.context_size + self.stride_left_size + self.stride_right_size #+ self.stride_left_size #+ 8 + 2 * 3
|
||||
|
||||
self.listening = False
|
||||
self.playing = False
|
||||
def get_audio_frame(self):
|
||||
try:
|
||||
frame = self.queue.get(block=False)
|
||||
type = 0
|
||||
#print(f'[INFO] get frame {frame.shape}')
|
||||
except queue.Empty:
|
||||
frame = np.zeros(self.chunk, dtype=np.float32)
|
||||
type = 1
|
||||
|
||||
return frame,type
|
||||
|
||||
def get_next_feat(self): #get audio embedding to nerf
|
||||
# return a [1/8, 16] window, for the next input to nerf side.
|
||||
|
@ -136,17 +111,8 @@ class ASR:
|
|||
|
||||
def run_step(self):
|
||||
|
||||
if self.terminated:
|
||||
return
|
||||
|
||||
# get a frame of audio
|
||||
frame,type = self.__get_audio_frame()
|
||||
|
||||
# the last frame
|
||||
if frame is None:
|
||||
# terminate, but always run the network for the left frames
|
||||
self.terminated = True
|
||||
else:
|
||||
frame,type = self.get_audio_frame()
|
||||
self.frames.append(frame)
|
||||
# put to output
|
||||
self.output_queue.put((frame,type))
|
||||
|
@ -157,7 +123,6 @@ class ASR:
|
|||
inputs = np.concatenate(self.frames) # [N * chunk]
|
||||
|
||||
# discard the old part to save memory
|
||||
if not self.terminated:
|
||||
self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]
|
||||
|
||||
#print(f'[INFO] frame_to_text... ')
|
||||
|
@ -166,10 +131,6 @@ class ASR:
|
|||
#print(f'-------wav2vec time:{time.time()-t:.4f}s')
|
||||
feats = logits # better lips-sync than labels
|
||||
|
||||
# save feats
|
||||
if self.opt.asr_save_feats:
|
||||
self.all_feats.append(feats)
|
||||
|
||||
# record the feats efficiently.. (no concat, constant memory)
|
||||
start = self.feat_buffer_idx * self.context_size
|
||||
end = start + feats.shape[0]
|
||||
|
@ -203,24 +164,6 @@ class ASR:
|
|||
# np.save(output_path, unfold_feats.cpu().numpy())
|
||||
# print(f"[INFO] saved logits to {output_path}")
|
||||
|
||||
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
|
||||
self.queue.put(audio_chunk)
|
||||
|
||||
def __get_audio_frame(self):
|
||||
if self.inwarm: # warm up
|
||||
return np.zeros(self.chunk, dtype=np.float32),1
|
||||
|
||||
try:
|
||||
frame = self.queue.get(block=False)
|
||||
type = 0
|
||||
print(f'[INFO] get frame {frame.shape}')
|
||||
except queue.Empty:
|
||||
frame = np.zeros(self.chunk, dtype=np.float32)
|
||||
type = 1
|
||||
|
||||
self.idx = self.idx + self.chunk
|
||||
|
||||
return frame,type
|
||||
|
||||
|
||||
def __frame_to_text(self, frame):
|
||||
|
@ -241,8 +184,8 @@ class ASR:
|
|||
right = min(logits.shape[1], logits.shape[1] - self.stride_right_size + 1) # +1 to make sure output is the same length as input.
|
||||
|
||||
# do not cut right if terminated.
|
||||
if self.terminated:
|
||||
right = logits.shape[1]
|
||||
# if self.terminated:
|
||||
# right = logits.shape[1]
|
||||
|
||||
logits = logits[:, left:right]
|
||||
|
||||
|
@ -263,9 +206,22 @@ class ASR:
|
|||
return logits[0], None,None #predicted_ids[0], transcription # [N,]
|
||||
|
||||
|
||||
def get_audio_out(self): #get origin audio pcm to nerf
|
||||
return self.output_queue.get()
|
||||
def warm_up(self):
|
||||
print(f'[INFO] warm up ASR live model, expected latency = {self.warm_up_steps / self.fps:.6f}s')
|
||||
t = time.time()
|
||||
#for _ in range(self.stride_left_size):
|
||||
# self.frames.append(np.zeros(self.chunk, dtype=np.float32))
|
||||
for _ in range(self.warm_up_steps):
|
||||
self.run_step()
|
||||
#if torch.cuda.is_available():
|
||||
# torch.cuda.synchronize()
|
||||
t = time.time() - t
|
||||
print(f'[INFO] warm-up done, actual latency = {t:.6f}s')
|
||||
|
||||
#self.clear_queue()
|
||||
|
||||
#####not used function#####################################
|
||||
'''
|
||||
def __init_queue(self):
|
||||
self.frames = []
|
||||
self.queue.queue.clear()
|
||||
|
@ -290,26 +246,6 @@ class ASR:
|
|||
if self.play:
|
||||
self.output_queue.queue.clear()
|
||||
|
||||
def warm_up(self):
|
||||
|
||||
#self.listen()
|
||||
|
||||
self.inwarm = True
|
||||
print(f'[INFO] warm up ASR live model, expected latency = {self.warm_up_steps / self.fps:.6f}s')
|
||||
t = time.time()
|
||||
#for _ in range(self.stride_left_size):
|
||||
# self.frames.append(np.zeros(self.chunk, dtype=np.float32))
|
||||
for _ in range(self.warm_up_steps):
|
||||
self.run_step()
|
||||
#if torch.cuda.is_available():
|
||||
# torch.cuda.synchronize()
|
||||
t = time.time() - t
|
||||
print(f'[INFO] warm-up done, actual latency = {t:.6f}s')
|
||||
self.inwarm = False
|
||||
|
||||
#self.clear_queue()
|
||||
|
||||
#####not used function#####################################
|
||||
def listen(self):
|
||||
# start
|
||||
if self.mode == 'live' and not self.listening:
|
||||
|
@ -405,3 +341,4 @@ if __name__ == '__main__':
|
|||
|
||||
with ASR(opt) as asr:
|
||||
asr.run()
|
||||
'''
|
lipasr.py (54 changed lines)
|
@ -6,60 +6,16 @@ import queue
|
|||
from queue import Queue
|
||||
import multiprocessing as mp
|
||||
|
||||
from baseasr import BaseASR
|
||||
from wav2lip import audio
|
||||
|
||||
class LipASR:
|
||||
def __init__(self, opt):
|
||||
self.opt = opt
|
||||
|
||||
self.fps = opt.fps # 20 ms per frame
|
||||
self.sample_rate = 16000
|
||||
self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000)
|
||||
self.queue = Queue()
|
||||
# self.input_stream = BytesIO()
|
||||
self.output_queue = mp.Queue()
|
||||
|
||||
#self.audio_processor = audio_processor
|
||||
self.batch_size = opt.batch_size
|
||||
|
||||
self.frames = []
|
||||
self.stride_left_size = opt.l
|
||||
self.stride_right_size = opt.r
|
||||
#self.context_size = 10
|
||||
self.feat_queue = mp.Queue(5)
|
||||
|
||||
self.warm_up()
|
||||
|
||||
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
|
||||
self.queue.put(audio_chunk)
|
||||
|
||||
def __get_audio_frame(self):
|
||||
try:
|
||||
frame = self.queue.get(block=True,timeout=0.01)
|
||||
type = 0
|
||||
#print(f'[INFO] get frame {frame.shape}')
|
||||
except queue.Empty:
|
||||
frame = np.zeros(self.chunk, dtype=np.float32)
|
||||
type = 1
|
||||
|
||||
return frame,type
|
||||
|
||||
def get_audio_out(self): #get origin audio pcm to nerf
|
||||
return self.output_queue.get()
|
||||
|
||||
def warm_up(self):
|
||||
for _ in range(self.stride_left_size + self.stride_right_size):
|
||||
audio_frame,type=self.__get_audio_frame()
|
||||
self.frames.append(audio_frame)
|
||||
self.output_queue.put((audio_frame,type))
|
||||
for _ in range(self.stride_left_size):
|
||||
self.output_queue.get()
|
||||
class LipASR(BaseASR):
|
||||
|
||||
def run_step(self):
|
||||
############################################## extract audio feature ##############################################
|
||||
# get a frame of audio
|
||||
for _ in range(self.batch_size*2):
|
||||
frame,type = self.__get_audio_frame()
|
||||
frame,type = self.get_audio_frame()
|
||||
self.frames.append(frame)
|
||||
# put to output
|
||||
self.output_queue.put((frame,type))
|
||||
|
@ -89,7 +45,3 @@ class LipASR:
|
|||
|
||||
# discard the old part to save memory
|
||||
self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]
|
||||
|
||||
|
||||
def get_next_feat(self,block,timeout):
|
||||
return self.feat_queue.get(block,timeout)
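The common plumbing that lipasr.py (above), museasr.py, and asrreal.py used to duplicate now lives in a shared `BaseASR` class imported from baseasr.py, which is not part of this diff. A minimal reconstruction of what the subclasses appear to rely on, inferred from the removed code; the real implementation may differ:

```python
# Hedged reconstruction of baseasr.BaseASR (baseasr.py is not shown in this diff).
# Attribute and method names match the call sites in the subclasses; details are assumptions.
import queue
from queue import Queue
import multiprocessing as mp

import numpy as np


class BaseASR:
    def __init__(self, opt):
        self.opt = opt
        self.fps = opt.fps                          # 20 ms per frame
        self.sample_rate = 16000
        self.chunk = self.sample_rate // self.fps   # 320 samples per 20 ms chunk
        self.queue = Queue()                        # incoming 20 ms PCM chunks
        self.output_queue = mp.Queue()              # original PCM handed back to the renderer
        self.batch_size = opt.batch_size
        self.frames = []
        self.stride_left_size = opt.l
        self.stride_right_size = opt.r
        self.feat_queue = mp.Queue(5)               # extracted audio features

    def pause_talk(self):
        # assumption: an interrupt simply drops any audio that has not been consumed yet
        self.queue.queue.clear()

    def put_audio_frame(self, audio_chunk):         # 16 kHz, 20 ms PCM
        self.queue.put(audio_chunk)

    def get_audio_frame(self):
        try:
            frame = self.queue.get(block=True, timeout=0.01)
            type = 0                                # real audio
        except queue.Empty:
            frame = np.zeros(self.chunk, dtype=np.float32)
            type = 1                                # silence filler
        return frame, type

    def get_audio_out(self):                        # original PCM for the audio track
        return self.output_queue.get()

    def warm_up(self):
        for _ in range(self.stride_left_size + self.stride_right_size):
            audio_frame, type = self.get_audio_frame()
            self.frames.append(audio_frame)
            self.output_queue.put((audio_frame, type))
        for _ in range(self.stride_left_size):
            self.output_queue.get()

    def get_next_feat(self, block, timeout):
        return self.feat_queue.get(block, timeout)
```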
|
lipreal.py (12 changed lines)
|
@ -164,6 +164,7 @@ class LipReal:
|
|||
self.__loadavatar()
|
||||
|
||||
self.asr = LipASR(opt)
|
||||
self.asr.warm_up()
|
||||
if opt.tts == "edgetts":
|
||||
self.tts = EdgeTTS(opt,self)
|
||||
elif opt.tts == "gpt-sovits":
|
||||
|
@ -200,6 +201,10 @@ class LipReal:
|
|||
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
|
||||
self.asr.put_audio_frame(audio_chunk)
|
||||
|
||||
def pause_talk(self):
|
||||
self.tts.pause_talk()
|
||||
self.asr.pause_talk()
|
||||
|
||||
|
||||
def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
|
||||
|
||||
|
@ -257,9 +262,12 @@ class LipReal:
|
|||
t = time.perf_counter()
|
||||
self.asr.run_step()
|
||||
|
||||
if video_track._queue.qsize()>=2*self.opt.batch_size:
|
||||
# if video_track._queue.qsize()>=2*self.opt.batch_size:
|
||||
# print('sleep qsize=',video_track._queue.qsize())
|
||||
# time.sleep(0.04*video_track._queue.qsize()*0.8)
|
||||
if video_track._queue.qsize()>=5:
|
||||
print('sleep qsize=',video_track._queue.qsize())
|
||||
time.sleep(0.04*self.opt.batch_size*1.5)
|
||||
time.sleep(0.04*video_track._queue.qsize()*0.8)
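The replacement logic above throttles on the actual queue depth instead of a fixed multiple of batch_size: once roughly five 40 ms frames are queued, the render loop sleeps proportionally so the WebRTC track can drain. A small sketch of the pattern, with the threshold and factor taken from the lines above:

```python
# Backpressure sketch mirroring the change above: sleep in proportion to the number of
# queued 40 ms video frames once a small threshold is exceeded.
import time

def throttle(queue_size, frame_interval=0.04, threshold=5, drain_factor=0.8):
    if queue_size >= threshold:
        time.sleep(frame_interval * queue_size * drain_factor)
```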
|
||||
|
||||
# delay = _starttime+_totalframe*0.04-time.perf_counter() #40ms
|
||||
# if delay > 0:
|
||||
|
|
museasr.py (54 changed lines)
|
@ -1,65 +1,22 @@
|
|||
import time
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
import queue
|
||||
from queue import Queue
|
||||
import multiprocessing as mp
|
||||
|
||||
from baseasr import BaseASR
|
||||
from musetalk.whisper.audio2feature import Audio2Feature
|
||||
|
||||
class MuseASR:
|
||||
class MuseASR(BaseASR):
|
||||
def __init__(self, opt, audio_processor:Audio2Feature):
|
||||
self.opt = opt
|
||||
|
||||
self.fps = opt.fps # 20 ms per frame
|
||||
self.sample_rate = 16000
|
||||
self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000)
|
||||
self.queue = Queue()
|
||||
# self.input_stream = BytesIO()
|
||||
self.output_queue = mp.Queue()
|
||||
|
||||
super().__init__(opt)
|
||||
self.audio_processor = audio_processor
|
||||
self.batch_size = opt.batch_size
|
||||
|
||||
self.frames = []
|
||||
self.stride_left_size = opt.l
|
||||
self.stride_right_size = opt.r
|
||||
self.feat_queue = mp.Queue(5)
|
||||
|
||||
self.warm_up()
|
||||
|
||||
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
|
||||
self.queue.put(audio_chunk)
|
||||
|
||||
def __get_audio_frame(self):
|
||||
try:
|
||||
frame = self.queue.get(block=True,timeout=0.01)
|
||||
type = 0
|
||||
#print(f'[INFO] get frame {frame.shape}')
|
||||
except queue.Empty:
|
||||
frame = np.zeros(self.chunk, dtype=np.float32)
|
||||
type = 1
|
||||
|
||||
return frame,type
|
||||
|
||||
def get_audio_out(self): #get origin audio pcm to nerf
|
||||
return self.output_queue.get()
|
||||
|
||||
def warm_up(self):
|
||||
for _ in range(self.stride_left_size + self.stride_right_size):
|
||||
audio_frame,type=self.__get_audio_frame()
|
||||
self.frames.append(audio_frame)
|
||||
self.output_queue.put((audio_frame,type))
|
||||
|
||||
for _ in range(self.stride_left_size):
|
||||
self.output_queue.get()
|
||||
|
||||
def run_step(self):
|
||||
############################################## extract audio feature ##############################################
|
||||
start_time = time.time()
|
||||
for _ in range(self.batch_size*2):
|
||||
audio_frame,type=self.__get_audio_frame()
|
||||
audio_frame,type=self.get_audio_frame()
|
||||
self.frames.append(audio_frame)
|
||||
self.output_queue.put((audio_frame,type))
|
||||
|
||||
|
@ -77,6 +34,3 @@ class MuseASR:
|
|||
self.feat_queue.put(whisper_chunks)
|
||||
# discard the old part to save memory
|
||||
self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]
|
||||
|
||||
def get_next_feat(self,block,timeout):
|
||||
return self.feat_queue.get(block,timeout)
|
musereal.py (25 changed lines)
|
@ -29,8 +29,6 @@ import asyncio
|
|||
from av import AudioFrame, VideoFrame
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def read_imgs(img_list):
|
||||
frames = []
|
||||
print('reading images...')
|
||||
|
@ -39,7 +37,6 @@ def read_imgs(img_list):
|
|||
frames.append(frame)
|
||||
return frames
|
||||
|
||||
|
||||
def __mirror_index(size, index):
|
||||
#size = len(self.coord_list_cycle)
|
||||
turn = index // size
|
||||
|
@ -49,7 +46,6 @@ def __mirror_index(size, index):
|
|||
else:
|
||||
return size - res - 1
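`__mirror_index` is what lets the avatar's frame list loop back and forth instead of jumping from the last frame straight to the first. A quick illustration of the intended behaviour, assuming the standard even/odd-pass form (the `res` computation and the parity test fall outside the lines shown here):

```python
# Illustration only: mirror indexing as used for looping avatar frames.
def mirror_index(size, index):
    turn = index // size
    res = index % size
    if turn % 2 == 0:
        return res             # even pass: walk the frames forward
    return size - res - 1      # odd pass: walk them backward

print([mirror_index(4, i) for i in range(10)])  # [0, 1, 2, 3, 3, 2, 1, 0, 0, 1]
```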
|
||||
|
||||
|
||||
def inference(render_event,batch_size,latents_out_path,audio_feat_queue,audio_out_queue,res_frame_queue,
|
||||
): #vae, unet, pe,timesteps
|
||||
|
||||
|
@ -128,7 +124,6 @@ def inference(render_event, batch_size, latents_out_path, audio_feat_queue, audi
|
|||
time.sleep(1)
|
||||
print('musereal inference processor stop')
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
class MuseReal:
|
||||
def __init__(self, opt):
|
||||
|
@ -140,7 +135,6 @@ class MuseReal:
|
|||
|
||||
#### musetalk
|
||||
self.avatar_id = opt.avatar_id
|
||||
self.static_img = opt.static_img
|
||||
self.video_path = '' #video_path
|
||||
self.bbox_shift = opt.bbox_shift
|
||||
self.avatar_path = f"./data/avatars/{self.avatar_id}"
|
||||
|
@ -163,6 +157,7 @@ class MuseReal:
|
|||
self.__loadavatar()
|
||||
|
||||
self.asr = MuseASR(opt,self.audio_processor)
|
||||
self.asr.warm_up()
|
||||
if opt.tts == "edgetts":
|
||||
self.tts = EdgeTTS(opt,self)
|
||||
elif opt.tts == "gpt-sovits":
|
||||
|
@ -199,12 +194,18 @@ class MuseReal:
|
|||
input_mask_list = sorted(input_mask_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
|
||||
self.mask_list_cycle = read_imgs(input_mask_list)
|
||||
|
||||
|
||||
def put_msg_txt(self,msg):
|
||||
self.tts.put_msg_txt(msg)
|
||||
|
||||
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
|
||||
self.asr.put_audio_frame(audio_chunk)
|
||||
|
||||
def pause_talk(self):
|
||||
self.tts.pause_talk()
|
||||
self.asr.pause_talk()
|
||||
|
||||
|
||||
def __mirror_index(self, index):
|
||||
size = len(self.coord_list_cycle)
|
||||
turn = index // size
|
||||
|
@ -237,6 +238,7 @@ class MuseReal:
|
|||
encoder_hidden_states=audio_feature_batch).sample
|
||||
recon = self.vae.decode_latents(pred_latents)
|
||||
|
||||
|
||||
def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
|
||||
|
||||
while not quit_event.is_set():
|
||||
|
@ -245,9 +247,6 @@ class MuseReal:
|
|||
except queue.Empty:
|
||||
continue
|
||||
if audio_frames[0][1]==1 and audio_frames[1][1]==1: # all-silence data, just use the full image
|
||||
if self.static_img:
|
||||
combine_frame = self.frame_list_cycle[0]
|
||||
else:
|
||||
combine_frame = self.frame_list_cycle[idx]
|
||||
else:
|
||||
bbox = self.coord_list_cycle[idx]
|
||||
|
@ -304,12 +303,16 @@ class MuseReal:
|
|||
# print(f"------actual avg infer fps:{count/totaltime:.4f}")
|
||||
# count=0
|
||||
# totaltime=0
|
||||
if video_track._queue.qsize() >= 2 * self.opt.batch_size:
|
||||
if video_track._queue.qsize()>=1.5*self.opt.batch_size:
|
||||
print('sleep qsize=',video_track._queue.qsize())
|
||||
time.sleep(0.04 * self.opt.batch_size * 1.5)
|
||||
time.sleep(0.04*video_track._queue.qsize()*0.8)
|
||||
# if video_track._queue.qsize()>=5:
|
||||
# print('sleep qsize=',video_track._queue.qsize())
|
||||
# time.sleep(0.04*video_track._queue.qsize()*0.8)
|
||||
|
||||
# delay = _starttime+_totalframe*0.04-time.perf_counter() #40ms
|
||||
# if delay > 0:
|
||||
# time.sleep(delay)
|
||||
self.render_event.clear() #end infer process render
|
||||
print('musereal thread stop')
|
||||
|
|
@ -7,14 +7,15 @@ from PIL import Image
|
|||
from .model import BiSeNet
|
||||
import torchvision.transforms as transforms
|
||||
|
||||
|
||||
class FaceParsing():
|
||||
def __init__(self,resnet_path='./models/face-parse-bisent/resnet18-5c106cde.pth',
|
||||
model_pth='./models/face-parse-bisent/79999_iter.pth'):
|
||||
self.net = self.model_init(resnet_path,model_pth)
|
||||
self.preprocess = self.image_preprocess()
|
||||
|
||||
def model_init(self,resnet_path, model_pth):
|
||||
def model_init(self,
|
||||
resnet_path,
|
||||
model_pth):
|
||||
net = BiSeNet(resnet_path)
|
||||
if torch.cuda.is_available():
|
||||
net.cuda()
|
||||
|
@ -49,8 +50,8 @@ class FaceParsing():
|
|||
parsing = Image.fromarray(parsing.astype(np.uint8))
|
||||
return parsing
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fp = FaceParsing()
|
||||
segmap = fp('154_small.png')
|
||||
segmap.save('res.png')
|
||||
|
||||
|
|
nerfreal.py (12 changed lines)
|
@ -20,9 +20,6 @@ class NeRFReal:
|
|||
self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
|
||||
self.W = opt.W
|
||||
self.H = opt.H
|
||||
self.debug = debug
|
||||
self.training = False
|
||||
self.step = 0 # training step
|
||||
|
||||
self.trainer = trainer
|
||||
self.data_loader = data_loader
|
||||
|
@ -44,7 +41,6 @@ class NeRFReal:
|
|||
#self.eye_area = None if not self.opt.exp_eye else data_loader._data.eye_area.mean().item()
|
||||
|
||||
# playing seq from dataloader, or pause.
|
||||
self.playing = True #False todo
|
||||
self.loader = iter(data_loader)
|
||||
|
||||
#self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32)
|
||||
|
@ -62,7 +58,6 @@ class NeRFReal:
|
|||
self.customimg_index = 0
|
||||
|
||||
# build asr
|
||||
if self.opt.asr:
|
||||
self.asr = ASR(opt)
|
||||
self.asr.warm_up()
|
||||
if opt.tts == "edgetts":
|
||||
|
@ -124,6 +119,10 @@ class NeRFReal:
|
|||
def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
|
||||
self.asr.put_audio_frame(audio_chunk)
|
||||
|
||||
def pause_talk(self):
|
||||
self.tts.pause_talk()
|
||||
self.asr.pause_talk()
|
||||
|
||||
|
||||
def mirror_index(self, index):
|
||||
size = self.opt.customvideo_imgnum
|
||||
|
@ -248,7 +247,6 @@ class NeRFReal:
|
|||
# update texture every frame
|
||||
# audio stream thread...
|
||||
t = time.perf_counter()
|
||||
if self.opt.asr and self.playing:
|
||||
# run 2 ASR steps (audio is at 50FPS, video is at 25FPS)
|
||||
for _ in range(2):
|
||||
self.asr.run_step()
|
||||
|
@ -267,7 +265,7 @@ class NeRFReal:
|
|||
else:
|
||||
if video_track._queue.qsize()>=5:
|
||||
#print('sleep qsize=',video_track._queue.qsize())
|
||||
time.sleep(0.1)
|
||||
time.sleep(0.04*video_track._queue.qsize()*0.8)
|
||||
print('nerfreal thread stop')
|
||||
|
||||
|
ttsreal.py (17 changed lines)
|
@ -13,6 +13,11 @@ import queue
|
|||
from queue import Queue
|
||||
from io import BytesIO
|
||||
from threading import Thread, Event
|
||||
from enum import Enum
|
||||
|
||||
class State(Enum):
|
||||
RUNNING=0
|
||||
PAUSE=1
|
||||
|
||||
class BaseTTS:
|
||||
def __init__(self, opt, parent):
|
||||
|
@ -25,6 +30,11 @@ class BaseTTS:
|
|||
self.input_stream = BytesIO()
|
||||
|
||||
self.msgqueue = Queue()
|
||||
self.state = State.RUNNING
|
||||
|
||||
def pause_talk(self):
|
||||
self.msgqueue.queue.clear()
|
||||
self.state = State.PAUSE
|
||||
|
||||
def put_msg_txt(self,msg):
|
||||
self.msgqueue.put(msg)
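The new `State` enum and `pause_talk` are what make interruption work end to end: a `/human` request with `interrupt: true` makes app.py call `nerfreal.pause_talk()`, which clears this message queue and flips the state to `PAUSE`, so audio chunks already being streamed stop being pushed. A hedged client sketch of triggering an interruption, assuming the aiohttp server from app.py on its default port 8010:

```python
# Hypothetical interrupt-and-speak request against the /human endpoint (host/port assumed).
import requests

requests.post(
    "http://serverip:8010/human",
    json={
        "type": "echo",       # 'echo' speaks the text verbatim; 'chat' routes it through the LLM
        "text": "hello",
        "interrupt": True,    # triggers nerfreal.pause_talk() before the new text is queued
    },
)
```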
|
||||
|
@ -37,6 +47,7 @@ class BaseTTS:
|
|||
while not quit_event.is_set():
|
||||
try:
|
||||
msg = self.msgqueue.get(block=True, timeout=1)
|
||||
self.state=State.RUNNING
|
||||
except queue.Empty:
|
||||
continue
|
||||
self.txt_to_audio(msg)
|
||||
|
@ -59,7 +70,7 @@ class EdgeTTS(BaseTTS):
|
|||
stream = self.__create_bytes_stream(self.input_stream)
|
||||
streamlen = stream.shape[0]
|
||||
idx=0
|
||||
while streamlen >= self.chunk:
|
||||
while streamlen >= self.chunk and self.state==State.RUNNING:
|
||||
self.parent.put_audio_frame(stream[idx:idx+self.chunk])
|
||||
streamlen -= self.chunk
|
||||
idx += self.chunk
|
||||
|
@ -92,7 +103,7 @@ class EdgeTTS(BaseTTS):
|
|||
async for chunk in communicate.stream():
|
||||
if first:
|
||||
first = False
|
||||
if chunk["type"] == "audio":
|
||||
if chunk["type"] == "audio" and self.state==State.RUNNING:
|
||||
#self.push_audio(chunk["data"])
|
||||
self.input_stream.write(chunk["data"])
|
||||
#file.write(chunk["data"])
|
||||
|
@ -147,7 +158,7 @@ class VoitsTTS(BaseTTS):
|
|||
end = time.perf_counter()
|
||||
print(f"gpt_sovits Time to first chunk: {end-start}s")
|
||||
first = False
|
||||
if chunk:
|
||||
if chunk and self.state==State.RUNNING:
|
||||
yield chunk
|
||||
|
||||
print("gpt_sovits response.elapsed:", res.elapsed)
|
||||
|
|
|
@ -29,22 +29,22 @@
|
|||
|
||||
$(document).ready(function() {
|
||||
var host = window.location.hostname
|
||||
var ws = new WebSocket("ws://"+host+":8000/humanchat");
|
||||
//document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
|
||||
ws.onopen = function() {
|
||||
console.log('Connected');
|
||||
};
|
||||
ws.onmessage = function(e) {
|
||||
console.log('Received: ' + e.data);
|
||||
data = e
|
||||
var vid = JSON.parse(data.data);
|
||||
console.log(typeof(vid),vid)
|
||||
//document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
|
||||
// var ws = new WebSocket("ws://"+host+":8000/humanecho");
|
||||
// //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
|
||||
// ws.onopen = function() {
|
||||
// console.log('Connected');
|
||||
// };
|
||||
// ws.onmessage = function(e) {
|
||||
// console.log('Received: ' + e.data);
|
||||
// data = e
|
||||
// var vid = JSON.parse(data.data);
|
||||
// console.log(typeof(vid),vid)
|
||||
// //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
|
||||
|
||||
};
|
||||
ws.onclose = function(e) {
|
||||
console.log('Closed');
|
||||
};
|
||||
// };
|
||||
// ws.onclose = function(e) {
|
||||
// console.log('Closed');
|
||||
// };
|
||||
|
||||
flvPlayer = mpegts.createPlayer({type: 'flv', url: "http://"+host+":8080/live/livestream.flv", isLive: true, enableStashBuffer: false});
|
||||
flvPlayer.attachMediaElement(document.getElementById('video_player'));
|
||||
|
@ -55,7 +55,17 @@
|
|||
e.preventDefault();
|
||||
var message = $('#message').val();
|
||||
console.log('Sending: ' + message);
|
||||
ws.send(message);
|
||||
fetch('/human', {
|
||||
body: JSON.stringify({
|
||||
text: message,
|
||||
type: 'chat',
|
||||
}),
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
method: 'POST'
|
||||
});
|
||||
//ws.send(message);
|
||||
$('#message').val('');
|
||||
});
|
||||
});
|
||||
|
|
|
@ -51,29 +51,39 @@
|
|||
<script type="text/javascript" charset="utf-8">
|
||||
|
||||
$(document).ready(function() {
|
||||
var host = window.location.hostname
|
||||
var ws = new WebSocket("ws://"+host+":8000/humanchat");
|
||||
//document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
|
||||
ws.onopen = function() {
|
||||
console.log('Connected');
|
||||
};
|
||||
ws.onmessage = function(e) {
|
||||
console.log('Received: ' + e.data);
|
||||
data = e
|
||||
var vid = JSON.parse(data.data);
|
||||
console.log(typeof(vid),vid)
|
||||
//document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
|
||||
// var host = window.location.hostname
|
||||
// var ws = new WebSocket("ws://"+host+":8000/humanecho");
|
||||
// //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
|
||||
// ws.onopen = function() {
|
||||
// console.log('Connected');
|
||||
// };
|
||||
// ws.onmessage = function(e) {
|
||||
// console.log('Received: ' + e.data);
|
||||
// data = e
|
||||
// var vid = JSON.parse(data.data);
|
||||
// console.log(typeof(vid),vid)
|
||||
// //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
|
||||
|
||||
};
|
||||
ws.onclose = function(e) {
|
||||
console.log('Closed');
|
||||
};
|
||||
// };
|
||||
// ws.onclose = function(e) {
|
||||
// console.log('Closed');
|
||||
// };
|
||||
|
||||
$('#echo-form').on('submit', function(e) {
|
||||
e.preventDefault();
|
||||
var message = $('#message').val();
|
||||
console.log('Sending: ' + message);
|
||||
ws.send(message);
|
||||
fetch('/human', {
|
||||
body: JSON.stringify({
|
||||
text: message,
|
||||
type: 'chat',
|
||||
}),
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
method: 'POST'
|
||||
});
|
||||
//ws.send(message);
|
||||
$('#message').val('');
|
||||
});
|
||||
});
|
||||
|
|
|
@ -79,6 +79,7 @@
|
|||
body: JSON.stringify({
|
||||
text: message,
|
||||
type: 'echo',
|
||||
interrupt: true,
|
||||
}),
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
|
|
|
@ -53,29 +53,40 @@
|
|||
<script type="text/javascript" charset="utf-8">
|
||||
|
||||
$(document).ready(function() {
|
||||
var host = window.location.hostname
|
||||
var ws = new WebSocket("ws://"+host+":8000/humanchat");
|
||||
//document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
|
||||
ws.onopen = function() {
|
||||
console.log('Connected');
|
||||
};
|
||||
ws.onmessage = function(e) {
|
||||
console.log('Received: ' + e.data);
|
||||
data = e
|
||||
var vid = JSON.parse(data.data);
|
||||
console.log(typeof(vid),vid)
|
||||
//document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
|
||||
// var host = window.location.hostname
|
||||
// var ws = new WebSocket("ws://"+host+":8000/humanecho");
|
||||
// //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
|
||||
// ws.onopen = function() {
|
||||
// console.log('Connected');
|
||||
// };
|
||||
// ws.onmessage = function(e) {
|
||||
// console.log('Received: ' + e.data);
|
||||
// data = e
|
||||
// var vid = JSON.parse(data.data);
|
||||
// console.log(typeof(vid),vid)
|
||||
// //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
|
||||
|
||||
};
|
||||
ws.onclose = function(e) {
|
||||
console.log('Closed');
|
||||
};
|
||||
// };
|
||||
// ws.onclose = function(e) {
|
||||
// console.log('Closed');
|
||||
// };
|
||||
|
||||
$('#echo-form').on('submit', function(e) {
|
||||
e.preventDefault();
|
||||
var message = $('#message').val();
|
||||
console.log('Sending: ' + message);
|
||||
ws.send(message);
|
||||
fetch('/human', {
|
||||
body: JSON.stringify({
|
||||
text: message,
|
||||
type: 'chat',
|
||||
interrupt: true,
|
||||
}),
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
method: 'POST'
|
||||
});
|
||||
//ws.send(message);
|
||||
$('#message').val('');
|
||||
});
|
||||
});
|
||||
|
|