diff --git a/README.md b/README.md
index 8c3e8ae..0ec1d9f 100644
--- a/README.md
+++ b/README.md
@@ -170,6 +170,11 @@ cd MuseTalk
 Edit configs/inference/realtime.yaml and set preparation to True
 python -m scripts.realtime_inference --inference_config configs/inference/realtime.yaml
 After running, copy the files under results/avatars into this project's data/avatars directory
+Method 2
+Run:
+cd musetalk
+python simple_musetalk.py --avatar_id 4 --file D:\\ok\\test.mp4
+Both video and image input are supported; the avatar is generated automatically under data/avatars
 ```
 ### 3.10 Using the wav2lip model
diff --git a/musetalk/simple_musetalk.py b/musetalk/simple_musetalk.py
new file mode 100644
index 0000000..97b8a36
--- /dev/null
+++ b/musetalk/simple_musetalk.py
@@ -0,0 +1,348 @@
+import argparse
+import glob
+import json
+import os
+import pickle
+import shutil
+
+import cv2
+import numpy as np
+import torch
+import torchvision.transforms as transforms
+from PIL import Image
+from diffusers import AutoencoderKL
+from face_alignment import NetworkSize
+from mmpose.apis import inference_topdown, init_model
+from mmpose.structures import merge_data_samples
+from tqdm import tqdm
+
+# support running both as "python simple_musetalk.py" from musetalk/ and as a package module
+try:
+    from utils.face_parsing import FaceParsing
+except ModuleNotFoundError:
+    from musetalk.utils.face_parsing import FaceParsing
+
+
+def video2imgs(vid_path, save_path, ext='.png', cut_frame=10000000):
+    """Extract up to cut_frame frames from a video into save_path as numbered images."""
+    cap = cv2.VideoCapture(vid_path)
+    count = 0
+    while count <= cut_frame:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        cv2.imwrite(f"{save_path}/{count:08d}{ext}", frame)
+        count += 1
+    cap.release()
+
+
+def read_imgs(img_list):
+    frames = []
+    print('reading images...')
+    for img_path in tqdm(img_list):
+        frame = cv2.imread(img_path)
+        frames.append(frame)
+    return frames
+
+
+def get_landmark_and_bbox(img_list, upperbondrange=0):
+    frames = read_imgs(img_list)
+    batch_size_fa = 1  # the pose model and face detector are fed one frame at a time
+    batches = [frames[i:i + batch_size_fa] for i in range(0, len(frames), batch_size_fa)]
+    coords_list = []
+    if upperbondrange != 0:
+        print('getting key landmarks and face bounding boxes with bbox_shift:', upperbondrange)
+    else:
+        print('getting key landmarks and face bounding boxes with the default value')
+    coord_placeholder = (0.0, 0.0, 0.0, 0.0)
+    for fb in tqdm(batches):
+        results = inference_topdown(model, np.asarray(fb)[0])
+        results = merge_data_samples(results)
+        keypoints = results.pred_instances.keypoints
+        # indices 23:91 of the COCO-WholeBody keypoint layout are the 68 face landmarks
+        face_land_mark = keypoints[0][23:91]
+        face_land_mark = face_land_mark.astype(np.int32)
+
+        # get bounding boxes by face detection
+        bbox = fa.get_detections_for_batch(np.asarray(fb))
+
+        # adjust each bounding box according to the landmarks and
+        # append it to the coordinates list as a tuple
+        for j, f in enumerate(bbox):
+            if f is None:  # no face in the image
+                coords_list += [coord_placeholder]
+                continue
+
+            half_face_coord = face_land_mark[29]  # np.mean([face_land_mark[28], face_land_mark[29]], axis=0)
+            if upperbondrange != 0:
+                # manual shift: positive moves the half-face line down (toward landmark 29),
+                # negative moves it up (toward landmark 28)
+                half_face_coord[1] = upperbondrange + half_face_coord[1]
+            half_face_dist = np.max(face_land_mark[:, 1]) - half_face_coord[1]
+            upper_bond = half_face_coord[1] - half_face_dist
+
+            f_landmark = (
+                np.min(face_land_mark[:, 0]), int(upper_bond), np.max(face_land_mark[:, 0]),
+                np.max(face_land_mark[:, 1]))
+            x1, y1, x2, y2 = f_landmark
+
+            if y2 - y1 <= 0 or x2 - x1 <= 0 or x1 < 0:  # if the landmark bbox is not usable, fall back to the detector bbox
+                coords_list += [f]
+                print("error bbox:", f)
+            else:
+                coords_list += [f_landmark]
+    return coords_list, frames
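+
+
+# A minimal sketch of the crop geometry above, for illustration only (it is not
+# called anywhere): the top edge of the landmark bbox is found by mirroring the
+# chin distance around the half-face line at landmark 29, so the crop extends as
+# far above that line as the chin lies below it.
+# def _upper_bound_sketch(face_land_mark, bbox_shift=0):
+#     half_y = face_land_mark[29][1] + bbox_shift  # optional manual shift
+#     chin_y = face_land_mark[:, 1].max()          # lowest face landmark
+#     return half_y - (chin_y - half_y)            # mirrored upper bound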
+
+
+class FaceAlignment:
+    def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
+                 device='cuda', flip_input=False, face_detector='sfd', verbose=False):
+        self.device = device
+        self.flip_input = flip_input
+        self.landmarks_type = landmarks_type
+        self.verbose = verbose
+
+        network_size = int(network_size)
+        if 'cuda' in device:
+            torch.backends.cudnn.benchmark = True
+            print('cuda start')
+
+        # load the requested face detector backend
+        face_detector_module = __import__('face_detection.detection.' + face_detector,
+                                          globals(), locals(), [face_detector], 0)
+        self.face_detector = face_detector_module.FaceDetector(device=device, verbose=verbose)
+
+    def get_detections_for_batch(self, images):
+        images = images[..., ::-1]  # BGR -> RGB
+        detected_faces = self.face_detector.detect_from_batch(images.copy())
+        results = []
+
+        for i, d in enumerate(detected_faces):
+            if len(d) == 0:
+                results.append(None)
+                continue
+            d = d[0]
+            d = np.clip(d, 0, None)
+
+            x1, y1, x2, y2 = map(int, d[:-1])
+            results.append((x1, y1, x2, y2))
+        return results
+
+
+def get_mask_tensor():
+    """
+    Creates a 256x256 binary mask whose upper half is 1 and lower half is 0.
+    :return: A mask tensor.
+    """
+    mask_tensor = torch.zeros((256, 256))
+    mask_tensor[:256 // 2, :] = 1
+    return mask_tensor
+
+
+def preprocess_img(img_name, half_mask=False):
+    if isinstance(img_name, str):
+        img = cv2.imread(img_name)
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        img = cv2.resize(img, (256, 256), interpolation=cv2.INTER_LANCZOS4)
+    else:
+        # an ndarray input is expected to be an already resized 256x256 BGR frame
+        img = cv2.cvtColor(img_name, cv2.COLOR_BGR2RGB)
+    window = [img]
+    x = np.asarray(window) / 255.
+    x = np.transpose(x, (3, 0, 1, 2))
+    x = torch.squeeze(torch.FloatTensor(x))
+    if half_mask:
+        x = x * (get_mask_tensor() > 0.5)  # blank out the lower half of the face
+    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+    x = normalize(x)
+    x = x.unsqueeze(0)  # [1, 3, 256, 256] torch tensor
+    x = x.to(device)
+    return x
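+
+
+# Shape walk-through for preprocess_img, for illustration only ("face.png" is a
+# hypothetical file): the result is a [1, 3, 256, 256] float tensor scaled to
+# [-1, 1]; with half_mask=True the lower half is zeroed before normalization,
+# which is the masked conditioning the UNet expects.
+#   x_masked = preprocess_img("face.png", half_mask=True)   # lower half blanked
+#   x_ref    = preprocess_img("face.png", half_mask=False)  # full reference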
+
+
+def encode_latents(image):
+    # VAE-encode a [1, 3, 256, 256] image into [1, 4, 32, 32] latents
+    with torch.no_grad():
+        init_latent_dist = vae.encode(image.to(vae.dtype)).latent_dist
+        init_latents = vae.config.scaling_factor * init_latent_dist.sample()
+    return init_latents
+
+
+def get_latents_for_unet(img):
+    ref_image = preprocess_img(img, half_mask=True)  # [1, 3, 256, 256] RGB, torch tensor
+    masked_latents = encode_latents(ref_image)  # [1, 4, 32, 32], torch tensor
+    ref_image = preprocess_img(img, half_mask=False)  # [1, 3, 256, 256] RGB, torch tensor
+    ref_latents = encode_latents(ref_image)  # [1, 4, 32, 32], torch tensor
+    latent_model_input = torch.cat([masked_latents, ref_latents], dim=1)
+    return latent_model_input
+
+
+def get_crop_box(box, expand):
+    # square crop centered on the face box, enlarged by the expand factor
+    x, y, x1, y1 = box
+    x_c, y_c = (x + x1) // 2, (y + y1) // 2
+    w, h = x1 - x, y1 - y
+    s = int(max(w, h) // 2 * expand)
+    crop_box = [x_c - s, y_c - s, x_c + s, y_c + s]
+    return crop_box, s
+
+
+def face_seg(image):
+    seg_image = fp(image)
+    if seg_image is None:
+        print("error: no person segment found")
+        return None
+
+    seg_image = seg_image.resize(image.size)
+    return seg_image
+
+
+def get_image_prepare_material(image, face_box, upper_boundary_ratio=0.5, expand=1.2):
+    body = Image.fromarray(image[:, :, ::-1])  # BGR -> RGB
+
+    x, y, x1, y1 = face_box
+    crop_box, s = get_crop_box(face_box, expand)
+    x_s, y_s, x_e, y_e = crop_box
+
+    face_large = body.crop(crop_box)
+    ori_shape = face_large.size
+
+    # restrict the segmentation mask to the original face box
+    mask_image = face_seg(face_large)
+    mask_small = mask_image.crop((x - x_s, y - y_s, x1 - x_s, y1 - y_s))
+    mask_image = Image.new('L', ori_shape, 0)
+    mask_image.paste(mask_small, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))
+
+    # keep only the lower part of the face (the talking area)
+    width, height = mask_image.size
+    top_boundary = int(height * upper_boundary_ratio)
+    modified_mask_image = Image.new('L', ori_shape, 0)
+    modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))
+
+    # feather the mask edge with an odd-sized Gaussian kernel for seamless pasting
+    blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
+    mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
+    return mask_array, crop_box
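+
+
+# Worked example for get_crop_box / get_image_prepare_material (illustrative
+# numbers only): face_box = (100, 100, 200, 220) with expand = 1.2 gives
+# s = int(max(100, 120) // 2 * 1.2) = 72 and crop_box = [78, 88, 222, 232],
+# i.e. a square crop centered on the face, 20% larger than its longest side.
+# The blur kernel is then int(0.1 * 144 // 2 * 2) + 1 = 15 for the 144-px crop.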
+
+
+# TODO: this is a simple extension-based check; use python-magic for a more
+# accurate content-based check if needed
+def is_video_file(file_path):
+    video_exts = ['.mp4', '.mkv', '.flv', '.avi', '.mov']  # common video extensions; add more as needed
+    file_ext = os.path.splitext(file_path)[1].lower()
+    return file_ext in video_exts
+
+
+def create_dir(dir_path):
+    if not os.path.exists(dir_path):
+        os.makedirs(dir_path)
+
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+
+def create_musetalk_human(file, avatar_id):
+    # output layout; normally there is no need to change it
+    save_path = os.path.join(current_dir, f'../data/avatars/avator_{avatar_id}')
+    save_full_path = os.path.join(save_path, 'full_imgs')
+    create_dir(save_path)
+    create_dir(save_full_path)
+    mask_out_path = os.path.join(save_path, 'mask')
+    create_dir(mask_out_path)
+
+    # artifacts consumed later at inference time
+    mask_coords_path = os.path.join(save_path, 'mask_coords.pkl')
+    coords_path = os.path.join(save_path, 'coords.pkl')
+    latents_out_path = os.path.join(save_path, 'latents.pt')
+
+    with open(os.path.join(save_path, 'avator_info.json'), "w") as f:
+        json.dump({
+            "avatar_id": avatar_id,
+            "video_path": file,
+            "bbox_shift": 5
+        }, f)
+
+    if os.path.isfile(file):
+        if is_video_file(file):
+            video2imgs(file, save_full_path)
+        else:
+            shutil.copyfile(file, f"{save_full_path}/{os.path.basename(file)}")
+    else:
+        files = os.listdir(file)
+        files.sort()
+        files = [name for name in files if name.split(".")[-1] == "png"]
+        for filename in files:
+            shutil.copyfile(f"{file}/{filename}", f"{save_full_path}/{filename}")
+    input_img_list = sorted(glob.glob(os.path.join(save_full_path, '*.[jpJP][pnPN]*[gG]')))
+    print("extracting landmarks...")
+    coord_list, frame_list = get_landmark_and_bbox(input_img_list, 5)
+    input_latent_list = []
+    # placeholder marking frames where no usable bbox was found
+    coord_placeholder = (0.0, 0.0, 0.0, 0.0)
+    for bbox, frame in zip(coord_list, frame_list):
+        if bbox == coord_placeholder:
+            continue
+        x1, y1, x2, y2 = bbox
+        crop_frame = frame[y1:y2, x1:x2]
+        resized_crop_frame = cv2.resize(crop_frame, (256, 256), interpolation=cv2.INTER_LANCZOS4)
+        latents = get_latents_for_unet(resized_crop_frame)
+        input_latent_list.append(latents)
+
+    # append the reversed sequence so the avatar loops back and forth seamlessly
+    frame_list_cycle = frame_list + frame_list[::-1]
+    coord_list_cycle = coord_list + coord_list[::-1]
+    input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
+    mask_coords_list_cycle = []
+    for i, frame in enumerate(tqdm(frame_list_cycle)):
+        cv2.imwrite(f"{save_full_path}/{str(i).zfill(8)}.png", frame)
+        face_box = coord_list_cycle[i]
+        mask, crop_box = get_image_prepare_material(frame, face_box)
+        cv2.imwrite(f"{mask_out_path}/{str(i).zfill(8)}.png", mask)
+        mask_coords_list_cycle += [crop_box]
+
+    with open(mask_coords_path, 'wb') as f:
+        pickle.dump(mask_coords_list_cycle, f)
+
+    with open(coords_path, 'wb') as f:
+        pickle.dump(coord_list_cycle, f)
+    torch.save(input_latent_list_cycle, latents_out_path)
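+
+
+# The globals below assume the standard MuseTalk weight layout relative to this
+# file: ../models/dwpose/dw-ll_ucoco_384.pth, ../models/sd-vae-ft-mse/ and
+# ../models/face-parse-bisent/{resnet18-5c106cde.pth, 79999_iter.pth}, plus the
+# dwpose config under utils/dwpose/. Adjust the paths if the weights live elsewhere.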
+
+
+# initialize the mmpose model and the other global models
+device = "cuda" if torch.cuda.is_available() else "cpu"
+fa = FaceAlignment(1, flip_input=False, device=device)
+config_file = os.path.join(current_dir, 'utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py')
+checkpoint_file = os.path.abspath(os.path.join(current_dir, '../models/dwpose/dw-ll_ucoco_384.pth'))
+model = init_model(config_file, checkpoint_file, device=device)
+vae = AutoencoderKL.from_pretrained(os.path.abspath(os.path.join(current_dir, '../models/sd-vae-ft-mse')))
+vae.to(device)
+fp = FaceParsing(os.path.abspath(os.path.join(current_dir, '../models/face-parse-bisent/resnet18-5c106cde.pth')),
+                 os.path.abspath(os.path.join(current_dir, '../models/face-parse-bisent/79999_iter.pth')))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--file",
+                        type=str,
+                        default=r'D:\ok\00000000.png',
+                        help="path to the source video or image",
+                        )
+    parser.add_argument("--avatar_id",
+                        type=str,
+                        default='3',
+                        help="output goes to data/avatars/avator_{avatar_id}",
+                        )
+    args = parser.parse_args()
+    create_musetalk_human(args.file, args.avatar_id)
diff --git a/musetalk/utils/face_parsing/__init__.py b/musetalk/utils/face_parsing/__init__.py
index fc963a3..520593e 100755
--- a/musetalk/utils/face_parsing/__init__.py
+++ b/musetalk/utils/face_parsing/__init__.py
@@ -8,17 +8,18 @@ from .model import BiSeNet
 import torchvision.transforms as transforms
 
 class FaceParsing():
-    def __init__(self):
-        self.net = self.model_init()
+    def __init__(self, resnet_path='./models/face-parse-bisent/resnet18-5c106cde.pth',
+                 model_pth='./models/face-parse-bisent/79999_iter.pth'):
+        self.net = self.model_init(resnet_path, model_pth)
         self.preprocess = self.image_preprocess()
 
-    def model_init(self,
-                   resnet_path='./models/face-parse-bisent/resnet18-5c106cde.pth',
-                   model_pth='./models/face-parse-bisent/79999_iter.pth'):
+    def model_init(self,
+                   resnet_path,
+                   model_pth):
         net = BiSeNet(resnet_path)
         if torch.cuda.is_available():
             net.cuda()
-            net.load_state_dict(torch.load(model_pth))
+            net.load_state_dict(torch.load(model_pth))
         else:
             net.load_state_dict(torch.load(model_pth, map_location=torch.device('cpu')))
         net.eval()
@@ -53,4 +54,4 @@ if __name__ == "__main__":
     fp = FaceParsing()
     segmap = fp('154_small.png')
     segmap.save('res.png')
-
+