add wav2lip customvideo

This commit is contained in:
lipku 2024-08-03 08:26:17 +08:00
parent 0c63e9a11b
commit 391512f68c
8 changed files with 248 additions and 13 deletions

20
app.py
View File

@ -150,6 +150,19 @@ async def human(request):
),
)
async def set_audiotype(request):
params = await request.json()
sessionid = params.get('sessionid',0)
nerfreals[sessionid].set_curr_state(params['audiotype'],params['reinit'])
return web.Response(
content_type="application/json",
text=json.dumps(
{"code": 0, "data":"ok"}
),
)
async def on_shutdown(app):
# close peer connections
coros = [pc.close() for pc in pcs]
@ -307,6 +320,8 @@ if __name__ == '__main__':
parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img')
parser.add_argument('--customvideo_imgnum', type=int, default=1)
parser.add_argument('--customvideo_config', type=str, default='')
parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
parser.add_argument('--REF_FILE', type=str, default=None)
parser.add_argument('--REF_TEXT', type=str, default=None)
@ -325,6 +340,10 @@ if __name__ == '__main__':
opt = parser.parse_args()
#app.config.from_object(opt)
#print(app.config)
opt.customopt = []
if opt.customvideo_config!='':
with open(opt.customvideo_config,'r') as file:
opt.customopt = json.load(file)
if opt.model == 'ernerf':
from ernerf.nerf_triplane.provider import NeRFDataset_Test
@ -402,6 +421,7 @@ if __name__ == '__main__':
appasync.on_shutdown.append(on_shutdown)
appasync.router.add_post("/offer", offer)
appasync.router.add_post("/human", human)
appasync.router.add_post("/set_audiotype", set_audiotype)
appasync.router.add_static('/',path='web')
# Configure default CORS settings.

View File

@ -7,8 +7,9 @@ import multiprocessing as mp
class BaseASR:
def __init__(self, opt):
def __init__(self, opt, parent=None):
self.opt = opt
self.parent = parent
self.fps = opt.fps # 20 ms per frame
self.sample_rate = 16000
@ -38,8 +39,12 @@ class BaseASR:
type = 0
#print(f'[INFO] get frame {frame.shape}')
except queue.Empty:
frame = np.zeros(self.chunk, dtype=np.float32)
type = 1
if self.parent and self.parent.curr_state>1: #播放自定义音频
frame = self.parent.get_audio_stream(self.parent.curr_state)
type = self.parent.curr_state
else:
frame = np.zeros(self.chunk, dtype=np.float32)
type = 1
return frame,type

81
basereal.py Normal file
View File

@ -0,0 +1,81 @@
import math
import torch
import numpy as np
import os
import time
import cv2
import glob
import pickle
import copy
import queue
from queue import Queue
from threading import Thread, Event
from io import BytesIO
import soundfile as sf
from tqdm import tqdm
def read_imgs(img_list):
frames = []
print('reading images...')
for img_path in tqdm(img_list):
frame = cv2.imread(img_path)
frames.append(frame)
return frames
class BaseReal:
def __init__(self, opt):
self.opt = opt
self.sample_rate = 16000
self.chunk = self.sample_rate // opt.fps # 320 samples per chunk (20ms * 16000 / 1000)
self.curr_state=0
self.custom_img_cycle = {}
self.custom_audio_cycle = {}
self.custom_audio_index = {}
self.custom_index = {}
self.custom_opt = {}
self.__loadcustom()
def __loadcustom(self):
for item in self.opt.customopt:
print(item)
input_img_list = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
self.custom_img_cycle[item['audiotype']] = read_imgs(input_img_list)
self.custom_audio_cycle[item['audiotype']], sample_rate = sf.read(item['audiopath'], dtype='float32')
self.custom_audio_index[item['audiotype']] = 0
self.custom_index[item['audiotype']] = 0
self.custom_opt[item['audiotype']] = item
def mirror_index(self,size, index):
#size = len(self.coord_list_cycle)
turn = index // size
res = index % size
if turn % 2 == 0:
return res
else:
return size - res - 1
def get_audio_stream(self,audiotype):
idx = self.custom_audio_index[audiotype]
stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
self.custom_audio_index[audiotype] += self.chunk
if self.custom_audio_index[audiotype]>=stream.shape[0]:
self.curr_state = 1 #当前视频不循环播放,切换到静音状态
return stream
def set_curr_state(self,audiotype, reinit):
self.curr_state = audiotype
if reinit:
self.custom_audio_index[audiotype] = 0
self.custom_index[audiotype] = 0
# def process_custom(self,audiotype:int,idx:int):
# if self.curr_state!=audiotype: #从推理切到口播
# if idx in self.switch_pos: #在卡点位置可以切换
# self.curr_state=audiotype
# self.custom_index=0
# else:
# self.custom_index+=1

7
data/custom_config.json Normal file
View File

@ -0,0 +1,7 @@
[
{
"audiotype":2,
"imgpath":"data/customvideo/image",
"audiopath":"data/customvideo/audio.wav"
}
]

View File

@ -23,8 +23,8 @@ from ttsreal import EdgeTTS,VoitsTTS,XTTS
from lipasr import LipASR
import asyncio
from av import AudioFrame, VideoFrame
from wav2lip.models import Wav2Lip
from basereal import BaseReal
from tqdm import tqdm
@ -143,9 +143,10 @@ def inference(render_event,batch_size,face_imgs_path,audio_feat_queue,audio_out_
print('musereal inference processor stop')
@torch.no_grad()
class LipReal:
class LipReal(BaseReal):
def __init__(self, opt):
self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
super().__init__(opt)
#self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
self.W = opt.W
self.H = opt.H
@ -163,7 +164,7 @@ class LipReal:
#self.__loadmodels()
self.__loadavatar()
self.asr = LipASR(opt)
self.asr = LipASR(opt,self)
self.asr.warm_up()
if opt.tts == "edgetts":
self.tts = EdgeTTS(opt,self)
@ -213,8 +214,16 @@ class LipReal:
res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
except queue.Empty:
continue
if audio_frames[0][1]==1 and audio_frames[1][1]==1: #全为静音数据只需要取fullimg
combine_frame = self.frame_list_cycle[idx]
if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据只需要取fullimg
audiotype = audio_frames[0][1]
if self.custom_index.get(audiotype) is not None: #有自定义视频
mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
combine_frame = self.custom_img_cycle[audiotype][mirindex]
self.custom_index[audiotype] += 1
# if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]):
# self.curr_state = 1 #当前视频不循环播放,切换到静音状态
else:
combine_frame = self.frame_list_cycle[idx]
else:
bbox = self.coord_list_cycle[idx]
combine_frame = copy.deepcopy(self.frame_list_cycle[idx])

View File

@ -15,7 +15,7 @@ class VllmGPT:
self.__URL = "http://{}:{}/v1/completions".format(self.host, self.port)
self.__URL2 = "http://{}:{}/v1/chat/completions".format(self.host, self.port)
def question(self,cont):
def chat(self,cont):
chat_list = []
# contentdb = content_db.new_instance()
# list = contentdb.get_list('all','desc',11)
@ -77,5 +77,5 @@ class VllmGPT:
if __name__ == "__main__":
vllm = VllmGPT('192.168.1.3','8101')
req = vllm.question("你叫什么名字啊今年多大了")
req = vllm.chat("你叫什么名字啊今年多大了")
print(req)

113
web/webrtcapi-custom.html Normal file
View File

@ -0,0 +1,113 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>WebRTC webcam</title>
<style>
button {
padding: 8px 16px;
}
video {
width: 100%;
}
.option {
margin-bottom: 8px;
}
#media {
max-width: 1280px;
}
</style>
</head>
<body>
<div class="option">
<input id="use-stun" type="checkbox"/>
<label for="use-stun">Use STUN server</label>
</div>
<button id="start" onclick="start()">Start</button>
<button id="stop" style="display: none" onclick="stop()">Stop</button>
<input type="hidden" id="sessionid" value="0">
<form class="form-inline" id="echo-form">
<div class="form-group">
<p>input text</p>
<textarea cols="2" rows="3" style="width:600px;height:50px;" class="form-control" id="message">test</textarea>
</div>
<button type="submit" class="btn btn-default">Send</button>
</form>
<div id="media">
<h2>Media</h2>
<audio id="audio" autoplay="true"></audio>
<video id="video" style="width:600px;" autoplay="true" playsinline="true"></video>
</div>
<button id="custom" onclick="custom()">切换视频</button>
<input type="text" id="audiotype" value="0">
<script src="client.js"></script>
<script type="text/javascript" src="http://cdn.sockjs.org/sockjs-0.3.4.js"></script>
<script type="text/javascript" src="https://ajax.aspnetcdn.com/ajax/jquery/jquery-2.1.1.min.js"></script>
</body>
<script type="text/javascript" charset="utf-8">
$(document).ready(function() {
// var host = window.location.hostname
// var ws = new WebSocket("ws://"+host+":8000/humanecho");
// //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
// ws.onopen = function() {
// console.log('Connected');
// };
// ws.onmessage = function(e) {
// console.log('Received: ' + e.data);
// data = e
// var vid = JSON.parse(data.data);
// console.log(typeof(vid),vid)
// //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
// };
// ws.onclose = function(e) {
// console.log('Closed');
// };
$('#echo-form').on('submit', function(e) {
e.preventDefault();
var message = $('#message').val();
console.log('Sending: ' + message);
console.log('sessionid: ',document.getElementById('sessionid').value);
fetch('/human', {
body: JSON.stringify({
text: message,
type: 'echo',
interrupt: true,
sessionid:parseInt(document.getElementById('sessionid').value),
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
});
//ws.send(message);
$('#message').val('');
});
function custom() {
fetch('/set_audiotype', {
body: JSON.stringify({
audiotype: parseInt(document.getElementById('audiotype').value),
reinit: false,
sessionid:parseInt(document.getElementById('sessionid').value),
}),
headers: {
'Content-Type': 'application/json'
},
method: 'POST'
});
}
});
</script>
</html>

View File

@ -30,7 +30,7 @@
</div>
<button id="start" onclick="start()">Start</button>
<button id="stop" style="display: none" onclick="stop()">Stop</button>
<input type="hidden" id="sessionid" value="1234">
<input type="hidden" id="sessionid" value="0">
<form class="form-inline" id="echo-form">
<div class="form-group">
<p>input text</p>