diff --git a/README.md b/README.md
index 783fc06..7f64ec4 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,12 @@ A streaming digital human based on the Ernerf model, realize audio video synch
 
 [![Watch the video]](/assets/demo.mp4)
 
+## Features
+1. Voice cloning
+2. Dialogue driven by large language models
+3. Multiple audio feature extractors: wav2vec, hubert
+4. Full-body video stitching
+
 ## 1. Installation
 
 Tested on Ubuntu 20.04, Python3.10, Pytorch 1.12 and CUDA 11.3
@@ -53,7 +59,7 @@ nginx
 
 Open http://serverip/echo.html in a browser, type any text into the text box and submit it; the digital human will speak that text.
 
-## 3. 更多使用
+## 3. More Usage
 ### 3.1 Digital human dialogue with an LLM
 
 This currently borrows the approach of the digital-human dialogue system [LinlyTalker](https://github.com/Kedreamix/Linly-Talker); the supported LLMs are ChatGPT, Qwen and GeminiPro. Fill in your own api_key in app.py.
@@ -97,7 +103,8 @@ ffmpeg -i fullbody.mp4 -vf fps=25 -qmin 1 -q:v 1 -start_number 0 data/fullbody/i
 python app.py --fullbody --fullbody_img data/fullbody/img --fullbody_offset_x 100 --fullbody_offset_y 5 --fullbody_width 580 --fullbody_height 1080 --W 400 --H 400
 ```
 - --fullbody_width, --fullbody_height: width and height of the full-body video
-- --W, --H: width and height of the training video
+- --W, --H: width and height of the training video
+- If the torso from the third step of ernerf training is poor, a visible seam appears where the head is stitched onto the body. In that case, add --torso_imgs data/xxx/torso_imgs to the command above: the torso is then taken directly from the torso images in the training dataset instead of being rendered by the model. This may leave slight artifacts around the head and neck.
 
 ## 4. Docker Run
 No need for the installation in step 1, run directly.
diff --git a/app.py b/app.py
index ef3fcfb..0503d8c 100644
--- a/app.py
+++ b/app.py
@@ -159,6 +159,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--pose', type=str, default="data/data_kf.json", help="transforms.json, pose source")
     parser.add_argument('--au', type=str, default="data/au.csv", help="eye blink area")
+    parser.add_argument('--torso_imgs', type=str, default="", help="torso images path")
 
     parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --exp_eye")
 
@@ -296,7 +297,8 @@ if __name__ == '__main__':
     opt.exp_eye = True
     opt.smooth_eye = True
 
-    opt.torso = True
+    if opt.torso_imgs=='': # no img, use model output
+        opt.torso = True
 
     # assert opt.cuda_ray, "Only support CUDA ray mode."
     opt.asr = True
@@ -305,6 +307,7 @@ if __name__ == '__main__':
     # assert opt.patch_size > 16, "patch_size should > 16 to run LPIPS loss."
     assert opt.num_rays % (opt.patch_size ** 2) == 0, "patch_size ** 2 should be dividable by num_rays."
     seed_everything(opt.seed)
+    print(opt)
 
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     model = NeRFNetwork(opt)
diff --git a/main.py b/main.py
index 6e432ea..cc8a0b5 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,7 @@
 import torch
 import argparse
 
-from nerf_triplane.provider import NeRFDataset
+from nerf_triplane.provider import NeRFDataset,NeRFDataset_Test
 from nerf_triplane.utils import *
 from nerf_triplane.network import NeRFNetwork
 
@@ -24,6 +24,9 @@ if __name__ == '__main__':
     parser.add_argument('--workspace', type=str, default='workspace')
     parser.add_argument('--seed', type=int, default=0)
 
+    parser.add_argument('--pose', type=str, default="data/data_kf.json", help="transforms.json, pose source")
+    parser.add_argument('--au', type=str, default="data/au.csv", help="eye blink area")
+
     ### training options
     parser.add_argument('--iters', type=int, default=200000, help="training iters")
     parser.add_argument('--lr', type=float, default=1e-2, help="initial learning rate")
@@ -47,7 +50,7 @@ if __name__ == '__main__':
 
     ### network backbone options
     parser.add_argument('--fp16', action='store_true', help="use amp mixed precision training")
-    parser.add_argument('--bg_img', type=str, default='', help="background image")
+    parser.add_argument('--bg_img', type=str, default='white', help="background image")
     parser.add_argument('--fbg', action='store_true', help="frame-wise bg")
    parser.add_argument('--exp_eye', action='store_true', help="explicitly control the eyes")
     parser.add_argument('--fix_eye', type=float, default=-1, help="fixed eye area, negative to disable, set to 0-0.3 for a reasonable eye")
@@ -182,7 +185,7 @@ if __name__ == '__main__':
 
     trainer = Trainer('ngp', opt, model, device=device, workspace=opt.workspace, criterion=criterion, fp16=opt.fp16, metrics=metrics, use_checkpoint=opt.ckpt)
 
     if opt.test_train:
-        test_set = NeRFDataset(opt, device=device, type='train')
+        test_set = NeRFDataset(opt, device=device, type='train')  # a manual fix to test on the training dataset
         test_set.training = False
         test_set.num_rays = -1
diff --git a/nerf_triplane/provider.py b/nerf_triplane/provider.py
index 727836c..4f0a704 100644
--- a/nerf_triplane/provider.py
+++ b/nerf_triplane/provider.py
@@ -98,6 +98,7 @@ class NeRFDataset_Test:
 
         self.training = False
         self.num_rays = -1
+        self.preload = opt.preload # 0 = disk, 1 = cpu, 2 = gpu
 
         # load nerf-compatible format data.
 
@@ -148,6 +149,7 @@ class NeRFDataset_Test:
         self.poses = []
         self.auds = []
         self.eye_area = []
+        self.torso_img = []
 
         for f in tqdm.tqdm(frames, desc=f'Loading data'):
 
@@ -172,6 +174,29 @@ class NeRFDataset_Test:
                 # area = area + np.random.rand() / 10
                 self.eye_area.append(area)
 
+
+            # load frame-wise bg
+
+            if self.opt.torso_imgs!='':
+                torso_img_path = os.path.join(self.opt.torso_imgs, str(f['img_id']) + '.png')
+
+                if self.preload > 0:
+                    torso_img = cv2.imread(torso_img_path, cv2.IMREAD_UNCHANGED) # [H, W, 4]
+                    torso_img = cv2.cvtColor(torso_img, cv2.COLOR_BGRA2RGBA)
+                    torso_img = torso_img.astype(np.float32) / 255 # [H, W, 3/4]
+
+                    self.torso_img.append(torso_img)
+                else:
+                    self.torso_img.append(torso_img_path)
+
+        if self.opt.torso_imgs!='':
+            if self.preload > 0:
+                self.torso_img = torch.from_numpy(np.stack(self.torso_img, axis=0)) # [N, H, W, C]
+            else:
+                self.torso_img = np.array(self.torso_img)
+            if self.preload > 1: # gpu
+                self.torso_img = self.torso_img.to(torch.half).to(self.device)
+
         # load pre-extracted background image (should be the same size as training image...)
 
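The torso-image loading added to `NeRFDataset_Test` above follows the loader's existing `--preload` convention: 0 keeps only file paths and decodes frames on the fly, 1 decodes everything into CPU memory, and 2 additionally moves the stacked tensor to the GPU in half precision. Below is a minimal standalone sketch of that policy; the `load_torso_images` helper and the example path are illustrative only, not the repo's actual API.

```python
import os
import cv2
import numpy as np
import torch

def load_torso_images(torso_dir, img_ids, preload, device='cuda'):
    """Illustrative sketch of the preload policy used by the dataset loader."""
    items = []
    for img_id in img_ids:
        path = os.path.join(torso_dir, f'{img_id}.png')
        if preload == 0:
            items.append(path)                        # disk mode: keep the path, decode later
            continue
        img = cv2.imread(path, cv2.IMREAD_UNCHANGED)  # [H, W, 4] BGRA with alpha
        img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGBA)
        items.append(img.astype(np.float32) / 255)    # [H, W, 4] in [0, 1]

    if preload == 0:
        return np.array(items)                        # array of file paths
    imgs = torch.from_numpy(np.stack(items, axis=0))  # [N, H, W, 4]
    if preload > 1:
        imgs = imgs.to(torch.half).to(device)         # gpu mode: fp16 to save VRAM
    return imgs

# e.g. torso = load_torso_images('data/fullbody/torso_imgs', range(100), preload=1)
```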
@@ -209,6 +234,9 @@ class NeRFDataset_Test:
 
         self.bg_img = torch.from_numpy(self.bg_img)
 
+        if self.preload > 1 or self.opt.torso_imgs=='': # gpu
+            self.bg_img = self.bg_img.to(torch.half).to(self.device)
+
         if self.opt.exp_eye:
             self.eye_area = np.array(self.eye_area, dtype=np.float32) # [N]
             print(f'[INFO] eye_area: {self.eye_area.min()} - {self.eye_area.max()}')
@@ -229,8 +257,6 @@ class NeRFDataset_Test:
 
         if self.auds is not None:
             self.auds = self.auds.to(self.device)
-
-        self.bg_img = self.bg_img.to(torch.half).to(self.device)
 
         if self.opt.exp_eye:
             self.eye_area = self.eye_area.to(self.device)
@@ -285,8 +311,23 @@ class NeRFDataset_Test:
                 results['eye'] = self.eye_area[index].to(self.device) # [1]
         else:
             results['eye'] = None
-
-        bg_img = self.bg_img.view(1, -1, 3).repeat(B, 1, 1).to(self.device)
+
+        # load bg
+        if self.opt.torso_imgs!='':
+            bg_torso_img = self.torso_img[index]
+            if self.preload == 0: # on the fly loading
+                bg_torso_img = cv2.imread(bg_torso_img[0], cv2.IMREAD_UNCHANGED) # [H, W, 4]
+                bg_torso_img = cv2.cvtColor(bg_torso_img, cv2.COLOR_BGRA2RGBA)
+                bg_torso_img = bg_torso_img.astype(np.float32) / 255 # [H, W, 3/4]
+                bg_torso_img = torch.from_numpy(bg_torso_img).unsqueeze(0)
+            bg_torso_img = bg_torso_img[..., :3] * bg_torso_img[..., 3:] + self.bg_img * (1 - bg_torso_img[..., 3:])
+            bg_torso_img = bg_torso_img.view(B, -1, 3).to(self.device)
+            if not self.opt.torso:
+                bg_img = bg_torso_img
+            else:
+                bg_img = self.bg_img.view(1, -1, 3).repeat(B, 1, 1).to(self.device)
+        else:
+            bg_img = self.bg_img.view(1, -1, 3).repeat(B, 1, 1).to(self.device)
 
         results['bg_color'] = bg_img
 
@@ -341,8 +382,30 @@ class NeRFDataset:
 
         # load nerf-compatible format data.
 
-        with open(opt.pose, 'r') as f:
-            transform = json.load(f)
+        # load all splits (train/valid/test)
+        if type == 'all':
+            transform_paths = glob.glob(os.path.join(self.root_path, '*.json'))
+            transform = None
+            for transform_path in transform_paths:
+                with open(transform_path, 'r') as f:
+                    tmp_transform = json.load(f)
+                    if transform is None:
+                        transform = tmp_transform
+                    else:
+                        transform['frames'].extend(tmp_transform['frames'])
+        # load train and val split
+        elif type == 'trainval':
+            with open(os.path.join(self.root_path, f'transforms_train.json'), 'r') as f:
+                transform = json.load(f)
+            with open(os.path.join(self.root_path, f'transforms_val.json'), 'r') as f:
+                transform_val = json.load(f)
+            transform['frames'].extend(transform_val['frames'])
+        # only load one specified split
+        else:
+            # no test, use val as test
+            _split = 'val' if type == 'test' else type
+            with open(os.path.join(self.root_path, f'transforms_{_split}.json'), 'r') as f:
+                transform = json.load(f)
 
         # load image size
         if 'h' in transform and 'w' in transform:
@@ -371,6 +434,10 @@ class NeRFDataset:
                 aud_features = np.load(os.path.join(self.root_path, 'aud_eo.npy'))
             elif 'deepspeech' in self.opt.asr_model:
                 aud_features = np.load(os.path.join(self.root_path, 'aud_ds.npy'))
+            # elif 'hubert_cn' in self.opt.asr_model:
+            #     aud_features = np.load(os.path.join(self.root_path, 'aud_hu_cn.npy'))
+            elif 'hubert' in self.opt.asr_model:
+                aud_features = np.load(os.path.join(self.root_path, 'aud_hu.npy'))
             else:
                 aud_features = np.load(os.path.join(self.root_path, 'aud.npy')) # cross-driven extracted features.
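In the collate path above, the dataset torso frame is alpha-blended over the static background and the result is handed to the renderer as `bg_color`, so only the head still comes from the NeRF while the torso pixels come straight from the training frames. A minimal sketch of that blend, assuming a float RGBA torso image and an RGB background of the same resolution (the function name is illustrative, not the repo's API):

```python
import torch

def composite_torso_over_bg(torso_rgba: torch.Tensor, bg_rgb: torch.Tensor) -> torch.Tensor:
    """Alpha-over blend: out = torso_rgb * alpha + bg_rgb * (1 - alpha)."""
    rgb, alpha = torso_rgba[..., :3], torso_rgba[..., 3:]
    return rgb * alpha + bg_rgb * (1 - alpha)

# toy usage with a white background, matching the new --bg_img default of 'white'
torso = torch.rand(512, 512, 4)                 # stand-in for a loaded torso PNG in [0, 1]
bg = torch.ones(512, 512, 3)                    # plain white background
blended = composite_torso_over_bg(torso, bg)    # [512, 512, 3]
bg_color = blended.view(1, -1, 3)               # flattened per-ray colors, as collate returns
```

This is why the seam at the head/body boundary disappears when `--torso_imgs` is given: the renderer paints the head directly onto a background that already contains the real torso.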