improve fullbody

lipku 2024-03-31 12:01:28 +08:00
parent ace4495631
commit 250cbaa587
4 changed files with 92 additions and 12 deletions


@@ -3,6 +3,12 @@ A streaming digital human based on the Ernerf model realizes audio video synch
[Watch the video](/assets/demo.mp4)
## Features
1. Supports voice cloning
2. Supports dialogue with large language models
3. Supports multiple audio-feature drivers: wav2vec, hubert
4. Supports full-body video stitching
## 1. Installation
Tested on Ubuntu 20.04, Python 3.10, PyTorch 1.12 and CUDA 11.3
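For reference, a minimal environment setup matching these versions could look like the following; the environment name and the use of conda are assumptions, not part of this commit:
```bash
conda create -n nerfstream python=3.10
conda activate nerfstream
# PyTorch 1.12 built against CUDA 11.3
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
```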
@@ -53,7 +59,7 @@ nginx
Open http://serverip/echo.html in a browser, type any text into the text box and submit it; the digital human will read the text aloud.
- ## 3. 更多使用
+ ## 3. More Usage
### 3.1 Digital-human dialogue with an LLM
This currently follows the approach of the digital-human dialogue system [LinlyTalker](https://github.com/Kedreamix/Linly-Talker). The LLM side supports ChatGPT, Qwen and GeminiPro; fill in your own api_key in app.py.
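As a rough illustration of what "fill in your api_key" amounts to for the ChatGPT backend, a minimal sketch using the official openai client (the variable names are illustrative assumptions, not this repo's actual code):
```python
from openai import OpenAI

api_key = "sk-..."  # your own key, as instructed above
client = OpenAI(api_key=api_key)
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
)
reply = resp.choices[0].message.content  # text to feed the digital human's speech
```
Qwen and GeminiPro would be wired up analogously through their own SDKs.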
@@ -98,6 +104,7 @@ python app.py --fullbody --fullbody_img data/fullbody/img --fullbody_offset_x 10
```
- --fullbody_width, --fullbody_height: width and height of the full-body video
- --W, --H: width and height of the training video
- If the torso from step 3 of ernerf training did not train well, there will be a visible seam at the join. You can add --torso_imgs data/xxx/torso_imgs to the command above; the torso is then taken directly from the torso images in the training dataset rather than inferred by the model. This approach may leave some artifacts around the head and neck; see the example command below.
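For example, a full-body launch that takes the torso from the training set could look like this (the paths and sizes are placeholders for your own data; only the flags themselves come from this README):
```bash
python app.py --fullbody --fullbody_img data/fullbody/img \
    --fullbody_offset_x 100 --fullbody_width 580 --fullbody_height 1080 \
    --W 450 --H 450 --torso_imgs data/fullbody/torso_imgs
```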
## 4. Docker Run
Runs directly, without the installation from step 1.

app.py

@@ -159,6 +159,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--pose', type=str, default="data/data_kf.json", help="transforms.json, pose source")
parser.add_argument('--au', type=str, default="data/au.csv", help="eye blink area")
parser.add_argument('--torso_imgs', type=str, default="", help="torso images path")
parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --exp_eye")
@@ -296,6 +297,7 @@ if __name__ == '__main__':
opt.exp_eye = True
opt.smooth_eye = True
if opt.torso_imgs == '': # no torso images supplied, use the model's torso output
opt.torso = True
# assert opt.cuda_ray, "Only support CUDA ray mode."
@@ -305,6 +307,7 @@ if __name__ == '__main__':
# assert opt.patch_size > 16, "patch_size should > 16 to run LPIPS loss."
assert opt.num_rays % (opt.patch_size ** 2) == 0, "num_rays should be divisible by patch_size ** 2."
seed_everything(opt.seed)
print(opt)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NeRFNetwork(opt)


@@ -1,7 +1,7 @@
import torch
import argparse
- from nerf_triplane.provider import NeRFDataset
+ from nerf_triplane.provider import NeRFDataset,NeRFDataset_Test
from nerf_triplane.utils import *
from nerf_triplane.network import NeRFNetwork
@@ -24,6 +24,9 @@ if __name__ == '__main__':
parser.add_argument('--workspace', type=str, default='workspace')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--pose', type=str, default="data/data_kf.json", help="transforms.json, pose source")
parser.add_argument('--au', type=str, default="data/au.csv", help="eye blink area")
### training options
parser.add_argument('--iters', type=int, default=200000, help="training iters")
parser.add_argument('--lr', type=float, default=1e-2, help="initial learning rate")
@@ -47,7 +50,7 @@ if __name__ == '__main__':
### network backbone options
parser.add_argument('--fp16', action='store_true', help="use amp mixed precision training")
- parser.add_argument('--bg_img', type=str, default='', help="background image")
+ parser.add_argument('--bg_img', type=str, default='white', help="background image")
parser.add_argument('--fbg', action='store_true', help="frame-wise bg")
parser.add_argument('--exp_eye', action='store_true', help="explicitly control the eyes")
parser.add_argument('--fix_eye', type=float, default=-1, help="fixed eye area, negative to disable, set to 0-0.3 for a reasonable eye")


@@ -98,6 +98,7 @@ class NeRFDataset_Test:
self.training = False
self.num_rays = -1
self.preload = opt.preload # 0 = disk, 1 = cpu, 2 = gpu
# load nerf-compatible format data.
@@ -148,6 +149,7 @@ class NeRFDataset_Test:
self.poses = []
self.auds = []
self.eye_area = []
self.torso_img = []
for f in tqdm.tqdm(frames, desc=f'Loading data'):
@@ -173,6 +175,29 @@ class NeRFDataset_Test:
self.eye_area.append(area)
# load frame-wise bg
if self.opt.torso_imgs!='':
torso_img_path = os.path.join(self.opt.torso_imgs, str(f['img_id']) + '.png')
if self.preload > 0:
torso_img = cv2.imread(torso_img_path, cv2.IMREAD_UNCHANGED) # [H, W, 4]
torso_img = cv2.cvtColor(torso_img, cv2.COLOR_BGRA2RGBA)
torso_img = torso_img.astype(np.float32) / 255 # [H, W, 3/4]
self.torso_img.append(torso_img)
else:
self.torso_img.append(torso_img_path)
if self.opt.torso_imgs!='':
if self.preload > 0:
self.torso_img = torch.from_numpy(np.stack(self.torso_img, axis=0)) # [N, H, W, C]
else:
self.torso_img = np.array(self.torso_img)
if self.preload > 1: #gpu
self.torso_img = self.torso_img.to(torch.half).to(self.device)
# load pre-extracted background image (should be the same size as training image...)
if self.opt.bg_img == 'white': # special
@@ -209,6 +234,9 @@ class NeRFDataset_Test:
self.bg_img = torch.from_numpy(self.bg_img)
if self.preload > 1 or self.opt.torso_imgs=='': #gpu
self.bg_img = self.bg_img.to(torch.half).to(self.device)
if self.opt.exp_eye:
self.eye_area = np.array(self.eye_area, dtype=np.float32) # [N]
print(f'[INFO] eye_area: {self.eye_area.min()} - {self.eye_area.max()}')
@@ -230,8 +258,6 @@ class NeRFDataset_Test:
if self.auds is not None:
self.auds = self.auds.to(self.device)
self.bg_img = self.bg_img.to(torch.half).to(self.device)
if self.opt.exp_eye:
self.eye_area = self.eye_area.to(self.device)
@@ -286,6 +312,21 @@ class NeRFDataset_Test:
else:
results['eye'] = None
# load bg
if self.opt.torso_imgs!='':
bg_torso_img = self.torso_img[index]
if self.preload == 0: # on the fly loading
bg_torso_img = cv2.imread(bg_torso_img[0], cv2.IMREAD_UNCHANGED) # [H, W, 4]
bg_torso_img = cv2.cvtColor(bg_torso_img, cv2.COLOR_BGRA2RGBA)
bg_torso_img = bg_torso_img.astype(np.float32) / 255 # [H, W, 3/4]
bg_torso_img = torch.from_numpy(bg_torso_img).unsqueeze(0)
bg_torso_img = bg_torso_img[..., :3] * bg_torso_img[..., 3:] + self.bg_img * (1 - bg_torso_img[..., 3:])
bg_torso_img = bg_torso_img.view(B, -1, 3).to(self.device)
if not self.opt.torso:
bg_img = bg_torso_img
else:
bg_img = self.bg_img.view(1, -1, 3).repeat(B, 1, 1).to(self.device)
else:
bg_img = self.bg_img.view(1, -1, 3).repeat(B, 1, 1).to(self.device)
results['bg_color'] = bg_img
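The blend above is standard "A over B" alpha compositing: the torso's alpha channel weights the torso RGB against the background. A minimal self-contained NumPy sketch of the same operation (shapes and names are illustrative, not part of this commit):
```python
import numpy as np

def composite_over(rgba, bg_rgb):
    """Blend a float32 RGBA foreground in [0, 1] over an RGB background."""
    rgb, alpha = rgba[..., :3], rgba[..., 3:]      # [H, W, 3], [H, W, 1]
    return rgb * alpha + bg_rgb * (1.0 - alpha)    # [H, W, 3]

torso = np.random.rand(512, 512, 4).astype(np.float32)  # stand-in torso frame
white_bg = np.ones((512, 512, 3), dtype=np.float32)     # matches bg_img='white'
blended = composite_over(torso, white_bg)
```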
@@ -341,7 +382,29 @@ class NeRFDataset:
# load nerf-compatible format data.
with open(opt.pose, 'r') as f:
# load all splits (train/valid/test)
if type == 'all':
transform_paths = glob.glob(os.path.join(self.root_path, '*.json'))
transform = None
for transform_path in transform_paths:
with open(transform_path, 'r') as f:
tmp_transform = json.load(f)
if transform is None:
transform = tmp_transform
else:
transform['frames'].extend(tmp_transform['frames'])
# load train and val split
elif type == 'trainval':
with open(os.path.join(self.root_path, f'transforms_train.json'), 'r') as f:
transform = json.load(f)
with open(os.path.join(self.root_path, f'transforms_val.json'), 'r') as f:
transform_val = json.load(f)
transform['frames'].extend(transform_val['frames'])
# only load one specified split
else:
# no test, use val as test
_split = 'val' if type == 'test' else type
with open(os.path.join(self.root_path, f'transforms_{_split}.json'), 'r') as f:
transform = json.load(f)
# load image size
@@ -371,6 +434,10 @@ class NeRFDataset:
aud_features = np.load(os.path.join(self.root_path, 'aud_eo.npy'))
elif 'deepspeech' in self.opt.asr_model:
aud_features = np.load(os.path.join(self.root_path, 'aud_ds.npy'))
# elif 'hubert_cn' in self.opt.asr_model:
# aud_features = np.load(os.path.join(self.root_path, 'aud_hu_cn.npy'))
elif 'hubert' in self.opt.asr_model:
aud_features = np.load(os.path.join(self.root_path, 'aud_hu.npy'))
else:
aud_features = np.load(os.path.join(self.root_path, 'aud.npy'))
# cross-driven extracted features.