improve fullbody

parent ace4495631
commit 250cbaa587

README.md | 11

@@ -3,6 +3,12 @@ A streaming digital human based on the Ernerf model, realize audio video synch
 [![Watch the video]](/assets/demo.mp4)
+
+## Features
+1. Supports voice cloning
+2. Supports large language model (LLM) dialogue
+3. Supports multiple audio feature drivers: wav2vec, hubert
+4. Supports full-body video stitching
 
 ## 1. Installation
 
 Tested on Ubuntu 20.04, Python3.10, Pytorch 1.12 and CUDA 11.3
@@ -53,7 +59,7 @@ nginx
 
 Open http://serverip/echo.html in a browser, type any text into the text box, and submit it. The digital human will speak that text.
 
-## 3. 更多使用
+## 3. More Usage
 ### 3.1 Using an LLM for digital-human dialogue
 
 Following the approach of the digital-human dialogue system [LinlyTalker](https://github.com/Kedreamix/Linly-Talker), the LLM supports Chatgpt, Qwen and GeminiPro. You need to fill in your own api_key in app.py.
@@ -97,7 +103,8 @@ ffmpeg -i fullbody.mp4 -vf fps=25 -qmin 1 -q:v 1 -start_number 0 data/fullbody/i
 python app.py --fullbody --fullbody_img data/fullbody/img --fullbody_offset_x 100 --fullbody_offset_y 5 --fullbody_width 580 --fullbody_height 1080 --W 400 --H 400
 ```
 - --fullbody_width, --fullbody_height: width and height of the full-body video
 - --W, --H: width and height of the training video
+- If the torso from step 3 of ernerf training is not trained well, there will be a seam at the stitching boundary. You can add --torso_imgs data/xxx/torso_imgs to the command above; the torso is then taken directly from the torso images in the training dataset instead of being inferred by the model. This approach may leave some artifacts around the head and neck.
 
 ## 4. Docker Run
 No installation from step 1 is needed; run it directly.
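A natural reading of the flags above is a simple paste: the model renders a --W x --H head crop, which is placed into the full-body frame at (--fullbody_offset_x, --fullbody_offset_y). A minimal sketch of that step with dummy arrays (names and shapes are illustrative only; the actual compositing lives in app.py):

```python
import numpy as np

# Dummy frames, shapes follow the flags in the command above:
fullbody_frame = np.zeros((1080, 580, 3), dtype=np.uint8)  # --fullbody_height x --fullbody_width
head_render    = np.zeros((400, 400, 3), dtype=np.uint8)   # --H x --W, rendered by the model

x, y = 100, 5  # --fullbody_offset_x, --fullbody_offset_y
h, w = head_render.shape[:2]

# Paste the rendered head crop back into the full-body frame at the offset.
fullbody_frame[y:y + h, x:x + w] = head_render
```

The offsets must keep the --W x --H crop inside the full-body frame, otherwise the slice assignment above would go out of bounds.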

app.py | 5

@@ -159,6 +159,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--pose', type=str, default="data/data_kf.json", help="transforms.json, pose source")
     parser.add_argument('--au', type=str, default="data/au.csv", help="eye blink area")
+    parser.add_argument('--torso_imgs', type=str, default="", help="torso images path")
     parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --exp_eye")
@@ -296,7 +297,8 @@ if __name__ == '__main__':
     opt.exp_eye = True
     opt.smooth_eye = True
 
-    opt.torso = True
+    if opt.torso_imgs=='': # no img, use model output
+        opt.torso = True
 
     # assert opt.cuda_ray, "Only support CUDA ray mode."
     opt.asr = True
@@ -305,6 +307,7 @@ if __name__ == '__main__':
     # assert opt.patch_size > 16, "patch_size should > 16 to run LPIPS loss."
     assert opt.num_rays % (opt.patch_size ** 2) == 0, "patch_size ** 2 should be dividable by num_rays."
     seed_everything(opt.seed)
+    print(opt)
 
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     model = NeRFNetwork(opt)

main.py | 9

@@ -1,7 +1,7 @@
 import torch
 import argparse
 
-from nerf_triplane.provider import NeRFDataset
+from nerf_triplane.provider import NeRFDataset,NeRFDataset_Test
 from nerf_triplane.utils import *
 from nerf_triplane.network import NeRFNetwork
@@ -24,6 +24,9 @@ if __name__ == '__main__':
     parser.add_argument('--workspace', type=str, default='workspace')
     parser.add_argument('--seed', type=int, default=0)
 
+    parser.add_argument('--pose', type=str, default="data/data_kf.json", help="transforms.json, pose source")
+    parser.add_argument('--au', type=str, default="data/au.csv", help="eye blink area")
+
     ### training options
     parser.add_argument('--iters', type=int, default=200000, help="training iters")
     parser.add_argument('--lr', type=float, default=1e-2, help="initial learning rate")
@@ -47,7 +50,7 @@ if __name__ == '__main__':
     ### network backbone options
     parser.add_argument('--fp16', action='store_true', help="use amp mixed precision training")
 
-    parser.add_argument('--bg_img', type=str, default='', help="background image")
+    parser.add_argument('--bg_img', type=str, default='white', help="background image")
     parser.add_argument('--fbg', action='store_true', help="frame-wise bg")
     parser.add_argument('--exp_eye', action='store_true', help="explicitly control the eyes")
     parser.add_argument('--fix_eye', type=float, default=-1, help="fixed eye area, negative to disable, set to 0-0.3 for a reasonable eye")
@@ -182,7 +185,7 @@ if __name__ == '__main__':
     trainer = Trainer('ngp', opt, model, device=device, workspace=opt.workspace, criterion=criterion, fp16=opt.fp16, metrics=metrics, use_checkpoint=opt.ckpt)
 
     if opt.test_train:
         test_set = NeRFDataset(opt, device=device, type='train')
         # a manual fix to test on the training dataset
         test_set.training = False
         test_set.num_rays = -1

nerf_triplane/provider.py

@@ -98,6 +98,7 @@ class NeRFDataset_Test:
 
         self.training = False
         self.num_rays = -1
+        self.preload = opt.preload # 0 = disk, 1 = cpu, 2 = gpu
 
         # load nerf-compatible format data.
@@ -148,6 +149,7 @@ class NeRFDataset_Test:
         self.poses = []
         self.auds = []
         self.eye_area = []
+        self.torso_img = []
 
         for f in tqdm.tqdm(frames, desc=f'Loading data'):
@@ -172,6 +174,29 @@ class NeRFDataset_Test:
                 # area = area + np.random.rand() / 10
                 self.eye_area.append(area)
 
+            # load frame-wise bg
+            if self.opt.torso_imgs!='':
+                torso_img_path = os.path.join(self.opt.torso_imgs, str(f['img_id']) + '.png')
+
+                if self.preload > 0:
+                    torso_img = cv2.imread(torso_img_path, cv2.IMREAD_UNCHANGED) # [H, W, 4]
+                    torso_img = cv2.cvtColor(torso_img, cv2.COLOR_BGRA2RGBA)
+                    torso_img = torso_img.astype(np.float32) / 255 # [H, W, 3/4]
+
+                    self.torso_img.append(torso_img)
+                else:
+                    self.torso_img.append(torso_img_path)
+
+        if self.opt.torso_imgs!='':
+            if self.preload > 0:
+                self.torso_img = torch.from_numpy(np.stack(self.torso_img, axis=0)) # [N, H, W, C]
+            else:
+                self.torso_img = np.array(self.torso_img)
+            if self.preload > 1: #gpu
+                self.torso_img = self.torso_img.to(torch.half).to(self.device)
+
 
         # load pre-extracted background image (should be the same size as training image...)
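The torso loading added here follows the preload convention noted earlier in this diff (0 = disk, 1 = cpu, 2 = gpu). A minimal standalone sketch of that pattern, with a hypothetical `load_image` / `prepare_torso` pair rather than the repository's actual code:

```python
import cv2
import numpy as np
import torch

def load_image(path: str) -> np.ndarray:
    """Read an RGBA image as float32 in [0, 1]."""
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)     # [H, W, 4]
    img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGBA)
    return img.astype(np.float32) / 255

def prepare_torso(paths, preload: int, device: str = "cuda"):
    if preload == 0:
        # Lazy: keep only the file paths and decode each frame on access.
        return np.array(paths)
    # Eager CPU: decode everything up front and stack to [N, H, W, 4].
    imgs = torch.from_numpy(np.stack([load_image(p) for p in paths], axis=0))
    if preload > 1:
        # Eager GPU: keep the whole stack on the device in fp16.
        imgs = imgs.to(torch.half).to(device)
    return imgs
```

The trade-off is the usual one: preload 0 minimizes memory at the cost of per-frame disk reads, while preload 2 maximizes throughput if the frame stack fits in GPU memory.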
@@ -209,6 +234,9 @@ class NeRFDataset_Test:
 
         self.bg_img = torch.from_numpy(self.bg_img)
 
+        if self.preload > 1 or self.opt.torso_imgs=='': #gpu
+            self.bg_img = self.bg_img.to(torch.half).to(self.device)
+
         if self.opt.exp_eye:
             self.eye_area = np.array(self.eye_area, dtype=np.float32) # [N]
             print(f'[INFO] eye_area: {self.eye_area.min()} - {self.eye_area.max()}')
@@ -229,8 +257,6 @@ class NeRFDataset_Test:
 
         if self.auds is not None:
             self.auds = self.auds.to(self.device)
 
-        self.bg_img = self.bg_img.to(torch.half).to(self.device)
-
         if self.opt.exp_eye:
             self.eye_area = self.eye_area.to(self.device)
@@ -285,8 +311,23 @@ class NeRFDataset_Test:
             results['eye'] = self.eye_area[index].to(self.device) # [1]
         else:
             results['eye'] = None
 
-        bg_img = self.bg_img.view(1, -1, 3).repeat(B, 1, 1).to(self.device)
+        # load bg
+        if self.opt.torso_imgs!='':
+            bg_torso_img = self.torso_img[index]
+            if self.preload == 0: # on the fly loading
+                bg_torso_img = cv2.imread(bg_torso_img[0], cv2.IMREAD_UNCHANGED) # [H, W, 4]
+                bg_torso_img = cv2.cvtColor(bg_torso_img, cv2.COLOR_BGRA2RGBA)
+                bg_torso_img = bg_torso_img.astype(np.float32) / 255 # [H, W, 3/4]
+                bg_torso_img = torch.from_numpy(bg_torso_img).unsqueeze(0)
+            bg_torso_img = bg_torso_img[..., :3] * bg_torso_img[..., 3:] + self.bg_img * (1 - bg_torso_img[..., 3:])
+            bg_torso_img = bg_torso_img.view(B, -1, 3).to(self.device)
+            if not self.opt.torso:
+                bg_img = bg_torso_img
+            else:
+                bg_img = self.bg_img.view(1, -1, 3).repeat(B, 1, 1).to(self.device)
+        else:
+            bg_img = self.bg_img.view(1, -1, 3).repeat(B, 1, 1).to(self.device)
 
         results['bg_color'] = bg_img
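The blending line above is standard alpha-over compositing: the torso RGBA is laid over the background as `out = rgb * alpha + bg * (1 - alpha)`. A minimal sketch with dummy tensors (shapes and names are illustrative only):

```python
import torch

H, W = 512, 512
torso_rgba = torch.rand(1, H, W, 4)   # RGBA torso frame, alpha in the last channel
bg = torch.rand(1, H, W, 3)           # pre-extracted background image

alpha = torso_rgba[..., 3:]            # [1, H, W, 1]
composited = torso_rgba[..., :3] * alpha + bg * (1 - alpha)   # [1, H, W, 3]

# Flattened to [B, H*W, 3], this is what ends up in results['bg_color'].
bg_color = composited.view(1, -1, 3)
```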
@@ -341,8 +382,30 @@ class NeRFDataset:
 
         # load nerf-compatible format data.
 
-        with open(opt.pose, 'r') as f:
-            transform = json.load(f)
+        # load all splits (train/valid/test)
+        if type == 'all':
+            transform_paths = glob.glob(os.path.join(self.root_path, '*.json'))
+            transform = None
+            for transform_path in transform_paths:
+                with open(transform_path, 'r') as f:
+                    tmp_transform = json.load(f)
+                    if transform is None:
+                        transform = tmp_transform
+                    else:
+                        transform['frames'].extend(tmp_transform['frames'])
+        # load train and val split
+        elif type == 'trainval':
+            with open(os.path.join(self.root_path, f'transforms_train.json'), 'r') as f:
+                transform = json.load(f)
+            with open(os.path.join(self.root_path, f'transforms_val.json'), 'r') as f:
+                transform_val = json.load(f)
+            transform['frames'].extend(transform_val['frames'])
+        # only load one specified split
+        else:
+            # no test, use val as test
+            _split = 'val' if type == 'test' else type
+            with open(os.path.join(self.root_path, f'transforms_{_split}.json'), 'r') as f:
+                transform = json.load(f)
 
         # load image size
         if 'h' in transform and 'w' in transform:
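All three branches end with a single `transform` dict whose `frames` list is what the rest of the loader iterates over; `h`/`w`, when present, give the image size (checked in the context lines above). An illustrative, not exhaustive, sketch of that layout based only on the keys this diff touches (`frames`, `img_id`, `h`, `w`):

```python
transform = {
    "h": 512,             # image height (optional)
    "w": 512,             # image width (optional)
    "frames": [
        {
            "img_id": 0,  # used to locate per-frame assets such as torso images
            # per-frame pose / audio fields omitted here
        },
        # ... one entry per frame; 'all' and 'trainval' simply concatenate these lists
    ],
}
```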
@@ -371,6 +434,10 @@ class NeRFDataset:
             aud_features = np.load(os.path.join(self.root_path, 'aud_eo.npy'))
         elif 'deepspeech' in self.opt.asr_model:
             aud_features = np.load(os.path.join(self.root_path, 'aud_ds.npy'))
+        # elif 'hubert_cn' in self.opt.asr_model:
+        #     aud_features = np.load(os.path.join(self.root_path, 'aud_hu_cn.npy'))
+        elif 'hubert' in self.opt.asr_model:
+            aud_features = np.load(os.path.join(self.root_path, 'aud_hu.npy'))
         else:
             aud_features = np.load(os.path.join(self.root_path, 'aud.npy'))
             # cross-driven extracted features.
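The branch added here is what backs README feature 3: the dataset picks a pre-extracted audio-feature file by substring-matching the asr_model name. A small sketch of that selection, with file names taken from the code above and a hypothetical helper name (the branch that selects aud_eo.npy sits above this hunk and is not shown):

```python
import os

def audio_feature_file(root_path: str, asr_model: str) -> str:
    """Pick the pre-extracted audio feature file for a given asr_model (sketch)."""
    if 'deepspeech' in asr_model:
        return os.path.join(root_path, 'aud_ds.npy')
    if 'hubert' in asr_model:            # new in this commit
        return os.path.join(root_path, 'aud_hu.npy')
    return os.path.join(root_path, 'aud.npy')
```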