add wav2lip customvideo

lipku 2024-08-03 08:26:17 +08:00
parent 0c63e9a11b
commit 391512f68c
8 changed files with 248 additions and 13 deletions

app.py (22 changed lines)

@@ -140,7 +140,7 @@ async def human(request):
     if params['type']=='echo':
         nerfreals[sessionid].put_msg_txt(params['text'])
     elif params['type']=='chat':
-        res=await asyncio.get_event_loop().run_in_executor(None, llm_response(params['text']))
+        res=await asyncio.get_event_loop().run_in_executor(None, llm_response(params['text']))
         nerfreals[sessionid].put_msg_txt(res)

     return web.Response(
@@ -150,6 +150,19 @@ async def human(request):
         ),
     )

+async def set_audiotype(request):
+    params = await request.json()
+
+    sessionid = params.get('sessionid',0)
+    nerfreals[sessionid].set_curr_state(params['audiotype'],params['reinit'])
+
+    return web.Response(
+        content_type="application/json",
+        text=json.dumps(
+            {"code": 0, "data":"ok"}
+        ),
+    )
+
 async def on_shutdown(app):
     # close peer connections
     coros = [pc.close() for pc in pcs]
@@ -307,6 +320,8 @@ if __name__ == '__main__':
     parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img')
     parser.add_argument('--customvideo_imgnum', type=int, default=1)
+    parser.add_argument('--customvideo_config', type=str, default='')
+
     parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
     parser.add_argument('--REF_FILE', type=str, default=None)
     parser.add_argument('--REF_TEXT', type=str, default=None)
@@ -325,6 +340,10 @@ if __name__ == '__main__':
     opt = parser.parse_args()
     #app.config.from_object(opt)
     #print(app.config)
+    opt.customopt = []
+    if opt.customvideo_config!='':
+        with open(opt.customvideo_config,'r') as file:
+            opt.customopt = json.load(file)

     if opt.model == 'ernerf':
         from ernerf.nerf_triplane.provider import NeRFDataset_Test
@@ -402,6 +421,7 @@ if __name__ == '__main__':
     appasync.on_shutdown.append(on_shutdown)
     appasync.router.add_post("/offer", offer)
     appasync.router.add_post("/human", human)
+    appasync.router.add_post("/set_audiotype", set_audiotype)
     appasync.router.add_static('/',path='web')

     # Configure default CORS settings.
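For reference, the new endpoint can be exercised from Python like this (a minimal sketch, not part of the commit; the port 8010 and session id 0 are assumptions):

    import requests

    # Switch session 0 to the custom clip registered as audiotype 2 in
    # --customvideo_config; reinit=True restarts the clip from the beginning.
    resp = requests.post('http://127.0.0.1:8010/set_audiotype', json={
        'sessionid': 0,
        'audiotype': 2,
        'reinit': True,
    })
    print(resp.json())  # expected: {"code": 0, "data": "ok"}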

baseasr.py

@@ -7,8 +7,9 @@ import multiprocessing as mp

 class BaseASR:
-    def __init__(self, opt):
+    def __init__(self, opt, parent=None):
         self.opt = opt
+        self.parent = parent

         self.fps = opt.fps # 20 ms per frame
         self.sample_rate = 16000
@@ -38,8 +39,12 @@ class BaseASR:
             type = 0
             #print(f'[INFO] get frame {frame.shape}')
         except queue.Empty:
-            frame = np.zeros(self.chunk, dtype=np.float32)
-            type = 1
+            if self.parent and self.parent.curr_state>1: # play the custom audio clip
+                frame = self.parent.get_audio_stream(self.parent.curr_state)
+                type = self.parent.curr_state
+            else:
+                frame = np.zeros(self.chunk, dtype=np.float32)
+                type = 1

         return frame,type
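The (frame, type) pair encodes the stream state that downstream renderers branch on: 0 is real TTS speech, 1 is silence, and any value above 1 names a custom audiotype slot. A hypothetical consumer, for illustration only:

    # get_audio_frame() is the BaseASR method patched above (an assumption
    # about its name; the hunk shows only its body).
    frame, t = asr.get_audio_frame()
    if t == 0:
        pass   # speech: run wav2lip on this 20 ms chunk
    elif t == 1:
        pass   # silence: show the idle full-body frame
    else:
        pass   # t > 1: play the custom clip registered for audiotype t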

basereal.py (new file, 81 lines)

@@ -0,0 +1,81 @@
import math
import torch
import numpy as np

import os
import time
import cv2
import glob
import pickle
import copy

import queue
from queue import Queue
from threading import Thread, Event
from io import BytesIO
import soundfile as sf

from tqdm import tqdm

def read_imgs(img_list):
    frames = []
    print('reading images...')
    for img_path in tqdm(img_list):
        frame = cv2.imread(img_path)
        frames.append(frame)
    return frames

class BaseReal:
    def __init__(self, opt):
        self.opt = opt
        self.sample_rate = 16000
        self.chunk = self.sample_rate // opt.fps # 320 samples per chunk (20ms * 16000 / 1000)

        self.curr_state=0
        self.custom_img_cycle = {}
        self.custom_audio_cycle = {}
        self.custom_audio_index = {}
        self.custom_index = {}
        self.custom_opt = {}
        self.__loadcustom()

    def __loadcustom(self):
        for item in self.opt.customopt:
            print(item)
            input_img_list = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
            input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
            self.custom_img_cycle[item['audiotype']] = read_imgs(input_img_list)
            # note: the returned sample rate is not checked; the wav is assumed to be 16 kHz
            self.custom_audio_cycle[item['audiotype']], sample_rate = sf.read(item['audiopath'], dtype='float32')
            self.custom_audio_index[item['audiotype']] = 0
            self.custom_index[item['audiotype']] = 0
            self.custom_opt[item['audiotype']] = item

    def mirror_index(self, size, index):
        #size = len(self.coord_list_cycle)
        turn = index // size
        res = index % size
        if turn % 2 == 0:
            return res
        else:
            return size - res - 1

    def get_audio_stream(self, audiotype):
        idx = self.custom_audio_index[audiotype]
        stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
        self.custom_audio_index[audiotype] += self.chunk
        if self.custom_audio_index[audiotype] >= self.custom_audio_cycle[audiotype].shape[0]:
            self.curr_state = 1 # the custom clip does not loop; switch to the silent state
        return stream

    def set_curr_state(self, audiotype, reinit):
        self.curr_state = audiotype
        if reinit:
            self.custom_audio_index[audiotype] = 0
            self.custom_index[audiotype] = 0

    # def process_custom(self, audiotype:int, idx:int):
    #     if self.curr_state != audiotype: # switching from inference to scripted playback
    #         if idx in self.switch_pos:   # switching is only allowed at a cut point
    #             self.curr_state = audiotype
    #             self.custom_index = 0
    #     else:
    #         self.custom_index += 1
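The ping-pong indexing in mirror_index is what lets a short frame cycle loop without a visible jump cut. A quick worked example (standalone re-implementation, not repo code):

    def mirror_index(size, index):   # same logic as BaseReal.mirror_index
        turn, res = index // size, index % size
        return res if turn % 2 == 0 else size - res - 1

    print([mirror_index(4, i) for i in range(10)])
    # -> [0, 1, 2, 3, 3, 2, 1, 0, 0, 1]  forward, then mirrored, repeating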

data/custom_config.json (new file, 7 lines)

@@ -0,0 +1,7 @@
[
    {
        "audiotype":2,
        "imgpath":"data/customvideo/image",
        "audiopath":"data/customvideo/audio.wav"
    }
]
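Each entry binds an audiotype id to a directory of numbered frames and a wav file; ids 0 and 1 are effectively reserved (inference and silence), so custom slots must be greater than 1. A hedged pre-flight check, not part of the commit:

    import glob, json, os
    import soundfile as sf

    with open('data/custom_config.json') as f:
        for item in json.load(f):
            assert item['audiotype'] > 1, 'audiotype 0/1 are reserved for inference/silence'
            frames = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
            assert frames, f"no frames found under {item['imgpath']}"
            audio, sr = sf.read(item['audiopath'], dtype='float32')
            assert sr == 16000, 'BaseReal assumes 16 kHz audio'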

lipreal.py

@@ -23,8 +23,8 @@ from ttsreal import EdgeTTS,VoitsTTS,XTTS
 from lipasr import LipASR
 import asyncio
 from av import AudioFrame, VideoFrame
 from wav2lip.models import Wav2Lip
+from basereal import BaseReal

 from tqdm import tqdm
@@ -143,9 +143,10 @@ def inference(render_event,batch_size,face_imgs_path,audio_feat_queue,audio_out_
     print('musereal inference processor stop')

 @torch.no_grad()
-class LipReal:
+class LipReal(BaseReal):
     def __init__(self, opt):
-        self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
+        super().__init__(opt)
+        #self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.

         self.W = opt.W
         self.H = opt.H
@@ -163,7 +164,7 @@ class LipReal:
         #self.__loadmodels()
         self.__loadavatar()

-        self.asr = LipASR(opt)
+        self.asr = LipASR(opt,self)
         self.asr.warm_up()
         if opt.tts == "edgetts":
             self.tts = EdgeTTS(opt,self)
@@ -213,8 +214,16 @@ class LipReal:
                 res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
-            if audio_frames[0][1]==1 and audio_frames[1][1]==1: # both chunks are silence: just use the full image
-                combine_frame = self.frame_list_cycle[idx]
+            if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: # neither chunk is speech: just use the full image
+                audiotype = audio_frames[0][1]
+                if self.custom_index.get(audiotype) is not None: # a custom clip exists for this audiotype
+                    mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
+                    combine_frame = self.custom_img_cycle[audiotype][mirindex]
+                    self.custom_index[audiotype] += 1
+                    # if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]):
+                    #     self.curr_state = 1 # the clip does not loop; switch to the silent state
+                else:
+                    combine_frame = self.frame_list_cycle[idx]
             else:
                 bbox = self.coord_list_cycle[idx]
                 combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
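Each rendered video frame consumes two 20 ms audio chunks, so the custom branch only fires when both carry a non-speech type. A toy illustration of the gate (hypothetical values, not repo code):

    audio_frames = [(b'', 2), (b'', 2)]  # two chunks tagged with audiotype 2
    use_fullimg = audio_frames[0][1] != 0 and audio_frames[1][1] != 0
    print(use_fullimg)  # True -> pull the next frame from custom_img_cycle[2]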

llm/VllmGPT.py

@@ -15,7 +15,7 @@ class VllmGPT:
         self.__URL = "http://{}:{}/v1/completions".format(self.host, self.port)
         self.__URL2 = "http://{}:{}/v1/chat/completions".format(self.host, self.port)

-    def question(self,cont):
+    def chat(self,cont):
         chat_list = []
         # contentdb = content_db.new_instance()
         # list = contentdb.get_list('all','desc',11)
@@ -77,5 +77,5 @@ class VllmGPT:
 if __name__ == "__main__":
     vllm = VllmGPT('192.168.1.3','8101')
-    req = vllm.question("你叫什么名字啊今年多大了")
+    req = vllm.chat("你叫什么名字啊今年多大了")
     print(req)

web/webrtcapi-custom.html (new file, 113 lines)

@@ -0,0 +1,113 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>WebRTC webcam</title>
    <style>
    button {
        padding: 8px 16px;
    }
    video {
        width: 100%;
    }
    .option {
        margin-bottom: 8px;
    }
    #media {
        max-width: 1280px;
    }
    </style>
</head>
<body>
    <div class="option">
        <input id="use-stun" type="checkbox"/>
        <label for="use-stun">Use STUN server</label>
    </div>
    <button id="start" onclick="start()">Start</button>
    <button id="stop" style="display: none" onclick="stop()">Stop</button>
    <input type="hidden" id="sessionid" value="0">

    <form class="form-inline" id="echo-form">
        <div class="form-group">
            <p>input text</p>
            <textarea cols="2" rows="3" style="width:600px;height:50px;" class="form-control" id="message">test</textarea>
        </div>
        <button type="submit" class="btn btn-default">Send</button>
    </form>

    <div id="media">
        <h2>Media</h2>
        <audio id="audio" autoplay="true"></audio>
        <video id="video" style="width:600px;" autoplay="true" playsinline="true"></video>
    </div>
    <button id="custom" onclick="custom()">切换视频</button> <!-- "switch video" -->
    <input type="text" id="audiotype" value="0">

    <script src="client.js"></script>
    <script type="text/javascript" src="http://cdn.sockjs.org/sockjs-0.3.4.js"></script>
    <script type="text/javascript" src="https://ajax.aspnetcdn.com/ajax/jquery/jquery-2.1.1.min.js"></script>
</body>
<script type="text/javascript" charset="utf-8">
    $(document).ready(function() {
        // var host = window.location.hostname
        // var ws = new WebSocket("ws://"+host+":8000/humanecho");
        // //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
        // ws.onopen = function() {
        //     console.log('Connected');
        // };
        // ws.onmessage = function(e) {
        //     console.log('Received: ' + e.data);
        //     data = e
        //     var vid = JSON.parse(data.data);
        //     console.log(typeof(vid),vid)
        //     //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
        // };
        // ws.onclose = function(e) {
        //     console.log('Closed');
        // };

        $('#echo-form').on('submit', function(e) {
            e.preventDefault();
            var message = $('#message').val();
            console.log('Sending: ' + message);
            console.log('sessionid: ',document.getElementById('sessionid').value);
            fetch('/human', {
                body: JSON.stringify({
                    text: message,
                    type: 'echo',
                    interrupt: true,
                    sessionid:parseInt(document.getElementById('sessionid').value),
                }),
                headers: {
                    'Content-Type': 'application/json'
                },
                method: 'POST'
            });
            //ws.send(message);
            $('#message').val('');
        });

        function custom() {
            fetch('/set_audiotype', {
                body: JSON.stringify({
                    audiotype: parseInt(document.getElementById('audiotype').value),
                    reinit: false,
                    sessionid:parseInt(document.getElementById('sessionid').value),
                }),
                headers: {
                    'Content-Type': 'application/json'
                },
                method: 'POST'
            });
        }
    });
</script>
</html>

web/webrtcapi.html

@@ -30,7 +30,7 @@
     </div>
     <button id="start" onclick="start()">Start</button>
     <button id="stop" style="display: none" onclick="stop()">Stop</button>
-    <input type="hidden" id="sessionid" value="1234">
+    <input type="hidden" id="sessionid" value="0">
     <form class="form-inline" id="echo-form">
         <div class="form-group">
             <p>input text</p>