From 4e355e9ab9087ea5daab7668c3d01cc14ec45ccb Mon Sep 17 00:00:00 2001 From: lipku Date: Sat, 1 Jun 2024 06:58:02 +0800 Subject: [PATCH] del nouse code --- asrreal.py | 182 ++++++++++++++++------------------------------------ nerfreal.py | 163 +++++++++++++++++++++------------------------- webrtc.py | 2 + 3 files changed, 132 insertions(+), 215 deletions(-) diff --git a/asrreal.py b/asrreal.py index 75d0f49..f7beb38 100644 --- a/asrreal.py +++ b/asrreal.py @@ -14,27 +14,6 @@ from queue import Queue from threading import Thread, Event from io import BytesIO - -def _read_frame(stream, exit_event, queue, chunk): - - while True: - if exit_event.is_set(): - print(f'[INFO] read frame thread ends') - break - frame = stream.read(chunk, exception_on_overflow=False) - frame = np.frombuffer(frame, dtype=np.int16).astype(np.float32) / 32767 # [chunk] - queue.put(frame) - -def _play_frame(stream, exit_event, queue, chunk): - - while True: - if exit_event.is_set(): - print(f'[INFO] play frame thread ends') - break - frame = queue.get() - frame = (frame * 32767).astype(np.int16).tobytes() - stream.write(frame, chunk) - class ASR: def __init__(self, opt): @@ -76,23 +55,14 @@ class ASR: #self.audio_instance = pyaudio.PyAudio() #not need # create input stream - if self.mode == 'file': #live mode - self.file_stream = self.create_file_stream() - else: - self.queue = Queue() - self.input_stream = BytesIO() - self.output_queue = Queue() - # start a background process to read frames - #self.input_stream = self.audio_instance.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, output=False, frames_per_buffer=self.chunk) - #self.queue = Queue() - #self.process_read_frame = Thread(target=_read_frame, args=(self.input_stream, self.exit_event, self.queue, self.chunk)) + self.queue = Queue() + self.input_stream = BytesIO() + self.output_queue = Queue() + # start a background process to read frames + #self.input_stream = self.audio_instance.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, output=False, frames_per_buffer=self.chunk) + #self.queue = Queue() + #self.process_read_frame = Thread(target=_read_frame, args=(self.input_stream, self.exit_event, self.queue, self.chunk)) - # play out the audio too...? - if self.play: - self.output_stream = self.audio_instance.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=False, output=True, frames_per_buffer=self.chunk) - self.output_queue = Queue() - self.process_play_frame = Thread(target=_play_frame, args=(self.output_stream, self.exit_event, self.output_queue, self.chunk)) - # current location of audio self.idx = 0 @@ -212,51 +182,43 @@ class ASR: # self.text = self.text + ' ' + text # will only run once at ternimation - if self.terminated: - self.text += '\n[END]' - print(self.text) - if self.opt.asr_save_feats: - print(f'[INFO] save all feats for training purpose... 
') - feats = torch.cat(self.all_feats, dim=0) # [N, C] - # print('[INFO] before unfold', feats.shape) - window_size = 16 - padding = window_size // 2 - feats = feats.view(-1, self.audio_dim).permute(1, 0).contiguous() # [C, M] - feats = feats.view(1, self.audio_dim, -1, 1) # [1, C, M, 1] - unfold_feats = F.unfold(feats, kernel_size=(window_size, 1), padding=(padding, 0), stride=(2, 1)) # [1, C * window_size, M / 2 + 1] - unfold_feats = unfold_feats.view(self.audio_dim, window_size, -1).permute(2, 1, 0).contiguous() # [C, window_size, M / 2 + 1] --> [M / 2 + 1, window_size, C] - # print('[INFO] after unfold', unfold_feats.shape) - # save to a npy file - if 'esperanto' in self.opt.asr_model: - output_path = self.opt.asr_wav.replace('.wav', '_eo.npy') - else: - output_path = self.opt.asr_wav.replace('.wav', '.npy') - np.save(output_path, unfold_feats.cpu().numpy()) - print(f"[INFO] saved logits to {output_path}") + # if self.terminated: + # self.text += '\n[END]' + # print(self.text) + # if self.opt.asr_save_feats: + # print(f'[INFO] save all feats for training purpose... ') + # feats = torch.cat(self.all_feats, dim=0) # [N, C] + # # print('[INFO] before unfold', feats.shape) + # window_size = 16 + # padding = window_size // 2 + # feats = feats.view(-1, self.audio_dim).permute(1, 0).contiguous() # [C, M] + # feats = feats.view(1, self.audio_dim, -1, 1) # [1, C, M, 1] + # unfold_feats = F.unfold(feats, kernel_size=(window_size, 1), padding=(padding, 0), stride=(2, 1)) # [1, C * window_size, M / 2 + 1] + # unfold_feats = unfold_feats.view(self.audio_dim, window_size, -1).permute(2, 1, 0).contiguous() # [C, window_size, M / 2 + 1] --> [M / 2 + 1, window_size, C] + # # print('[INFO] after unfold', unfold_feats.shape) + # # save to a npy file + # if 'esperanto' in self.opt.asr_model: + # output_path = self.opt.asr_wav.replace('.wav', '_eo.npy') + # else: + # output_path = self.opt.asr_wav.replace('.wav', '.npy') + # np.save(output_path, unfold_feats.cpu().numpy()) + # print(f"[INFO] saved logits to {output_path}") def __get_audio_frame(self): if self.inwarm: # warm up return np.zeros(self.chunk, dtype=np.float32),1 - if self.mode == 'file': - if self.idx < self.file_stream.shape[0]: - frame = self.file_stream[self.idx: self.idx + self.chunk] - self.idx = self.idx + self.chunk - return frame,0 - else: - return None,0 - else: - try: - frame = self.queue.get(block=False) - type = 0 - print(f'[INFO] get frame {frame.shape}') - except queue.Empty: - frame = np.zeros(self.chunk, dtype=np.float32) - type = 1 + try: + frame = self.queue.get(block=False) + type = 0 + print(f'[INFO] get frame {frame.shape}') + except queue.Empty: + frame = np.zeros(self.chunk, dtype=np.float32) + type = 1 - self.idx = self.idx + self.chunk + self.idx = self.idx + self.chunk - return frame,type + return frame,type def __frame_to_text(self, frame): @@ -360,10 +322,6 @@ class ASR: self.tail = 8 # attention window... 
self.att_feats = [torch.zeros(self.audio_dim, 16, dtype=torch.float32, device=self.device)] * 4 - - def before_push_audio(self): - self.__init_queue() - self.warm_up() def run(self): @@ -399,53 +357,6 @@ class ASR: #self.clear_queue() - ''' - def create_file_stream(self): - - stream, sample_rate = sf.read(self.opt.asr_wav) # [T*sample_rate,] float64 - stream = stream.astype(np.float32) - - if stream.ndim > 1: - print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.') - stream = stream[:, 0] - - if sample_rate != self.sample_rate: - print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.') - stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate) - - print(f'[INFO] loaded audio stream {self.opt.asr_wav}: {stream.shape}') - - return stream - - - def create_pyaudio_stream(self): - - import pyaudio - - print(f'[INFO] creating live audio stream ...') - - audio = pyaudio.PyAudio() - - # get devices - info = audio.get_host_api_info_by_index(0) - n_devices = info.get('deviceCount') - - for i in range(0, n_devices): - if (audio.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0: - name = audio.get_device_info_by_host_api_device_index(0, i).get('name') - print(f'[INFO] choose audio device {name}, id {i}') - break - - # get stream - stream = audio.open(input_device_index=i, - format=pyaudio.paInt16, - channels=1, - rate=self.sample_rate, - input=True, - frames_per_buffer=self.chunk) - - return audio, stream - ''' #####not used function##################################### def listen(self): # start @@ -489,6 +400,25 @@ class ASR: # live mode: also print the result text. self.text += '\n[END]' print(self.text) + +def _read_frame(stream, exit_event, queue, chunk): + while True: + if exit_event.is_set(): + print(f'[INFO] read frame thread ends') + break + frame = stream.read(chunk, exception_on_overflow=False) + frame = np.frombuffer(frame, dtype=np.int16).astype(np.float32) / 32767 # [chunk] + queue.put(frame) + +def _play_frame(stream, exit_event, queue, chunk): + + while True: + if exit_event.is_set(): + print(f'[INFO] play frame thread ends') + break + frame = queue.get() + frame = (frame * 32767).astype(np.int16).tobytes() + stream.write(frame, chunk) ######################################################### if __name__ == '__main__': diff --git a/nerfreal.py b/nerfreal.py index f4a71e0..3e78e85 100644 --- a/nerfreal.py +++ b/nerfreal.py @@ -26,36 +26,36 @@ class NeRFReal: self.data_loader = data_loader # use dataloader's bg - bg_img = data_loader._data.bg_img #.view(1, -1, 3) - if self.H != bg_img.shape[0] or self.W != bg_img.shape[1]: - bg_img = F.interpolate(bg_img.permute(2, 0, 1).unsqueeze(0).contiguous(), (self.H, self.W), mode='bilinear').squeeze(0).permute(1, 2, 0).contiguous() - self.bg_color = bg_img.view(1, -1, 3) + #bg_img = data_loader._data.bg_img #.view(1, -1, 3) + #if self.H != bg_img.shape[0] or self.W != bg_img.shape[1]: + # bg_img = F.interpolate(bg_img.permute(2, 0, 1).unsqueeze(0).contiguous(), (self.H, self.W), mode='bilinear').squeeze(0).permute(1, 2, 0).contiguous() + #self.bg_color = bg_img.view(1, -1, 3) # audio features (from dataloader, only used in non-playing mode) - self.audio_features = data_loader._data.auds # [N, 29, 16] - self.audio_idx = 0 + #self.audio_features = data_loader._data.auds # [N, 29, 16] + #self.audio_idx = 0 #self.frame_total_num = data_loader._data.end_index #print("frame_total_num:",self.frame_total_num) # control eye - self.eye_area = None if not 
self.opt.exp_eye else data_loader._data.eye_area.mean().item() + #self.eye_area = None if not self.opt.exp_eye else data_loader._data.eye_area.mean().item() # playing seq from dataloader, or pause. self.playing = True #False todo self.loader = iter(data_loader) - self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32) - self.need_update = True # camera moved, should reset accumulation - self.spp = 1 # sample per pixel - self.mode = 'image' # choose from ['image', 'depth'] + #self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32) + #self.need_update = True # camera moved, should reset accumulation + #self.spp = 1 # sample per pixel + #self.mode = 'image' # choose from ['image', 'depth'] - self.dynamic_resolution = False # assert False! - self.downscale = 1 - self.train_steps = 16 + #self.dynamic_resolution = False # assert False! + #self.downscale = 1 + #self.train_steps = 16 - self.ind_index = 0 - self.ind_num = trainer.model.individual_codes.shape[0] + #self.ind_index = 0 + #self.ind_num = trainer.model.individual_codes.shape[0] self.customimg_index = 0 @@ -113,8 +113,6 @@ class NeRFReal: def push_audio(self,chunk): self.asr.push_audio(chunk) - def before_push_audio(self): - self.asr.before_push_audio() def mirror_index(self, index): size = self.opt.customvideo_imgnum @@ -125,91 +123,78 @@ class NeRFReal: else: return size - res - 1 - def prepare_buffer(self, outputs): - if self.mode == 'image': - return outputs['image'] - else: - return np.expand_dims(outputs['depth'], -1).repeat(3, -1) - def test_step(self,loop=None,audio_track=None,video_track=None): #starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) #starter.record() - if self.playing: - try: - data = next(self.loader) - except StopIteration: - self.loader = iter(self.data_loader) - data = next(self.loader) - - if self.opt.asr: - # use the live audio stream - data['auds'] = self.asr.get_next_feat() + try: + data = next(self.loader) + except StopIteration: + self.loader = iter(self.data_loader) + data = next(self.loader) + + if self.opt.asr: + # use the live audio stream + data['auds'] = self.asr.get_next_feat() - audiotype = 0 + audiotype = 0 + if self.opt.transport=='rtmp': + for _ in range(2): + frame,type = self.asr.get_audio_out() + audiotype += type + #print(f'[INFO] get_audio_out shape ',frame.shape) + self.streamer.stream_frame_audio(frame) + else: + for _ in range(2): + frame,type = self.asr.get_audio_out() + audiotype += type + frame = (frame * 32767).astype(np.int16) + new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) + new_frame.planes[0].update(frame.tobytes()) + new_frame.sample_rate=16000 + # if audio_track._queue.qsize()>10: + # time.sleep(0.1) + asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) + #t = time.time() + if self.opt.customvideo and audiotype!=0: + self.loader = iter(self.data_loader) #init + imgindex = self.mirror_index(self.customimg_index) + #print('custom img index:',imgindex) + image = cv2.imread(os.path.join(self.opt.customvideo_img, str(int(imgindex))+'.png')) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) if self.opt.transport=='rtmp': - for _ in range(2): - frame,type = self.asr.get_audio_out() - audiotype += type - #print(f'[INFO] get_audio_out shape ',frame.shape) - self.streamer.stream_frame_audio(frame) + self.streamer.stream_frame(image) else: - for _ in range(2): - frame,type = self.asr.get_audio_out() - audiotype += type - frame = (frame * 32767).astype(np.int16) - new_frame 
= AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) - new_frame.planes[0].update(frame.tobytes()) - new_frame.sample_rate=16000 - # if audio_track._queue.qsize()>10: - # time.sleep(0.1) - asyncio.run_coroutine_threadsafe(audio_track._queue.put(new_frame), loop) - #t = time.time() - if self.opt.customvideo and audiotype!=0: - self.loader = iter(self.data_loader) #init - imgindex = self.mirror_index(self.customimg_index) - #print('custom img index:',imgindex) - image = cv2.imread(os.path.join(self.opt.customvideo_img, str(int(imgindex))+'.png')) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + new_frame = VideoFrame.from_ndarray(image, format="rgb24") + asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) + self.customimg_index += 1 + else: + self.customimg_index = 0 + outputs = self.trainer.test_gui_with_data(data, self.W, self.H) + #print('-------ernerf time: ',time.time()-t) + #print(f'[INFO] outputs shape ',outputs['image'].shape) + image = (outputs['image'] * 255).astype(np.uint8) + if not self.opt.fullbody: if self.opt.transport=='rtmp': self.streamer.stream_frame(image) else: new_frame = VideoFrame.from_ndarray(image, format="rgb24") asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) - self.customimg_index += 1 - else: - self.customimg_index = 0 - outputs = self.trainer.test_gui_with_data(data, self.W, self.H) - #print('-------ernerf time: ',time.time()-t) - #print(f'[INFO] outputs shape ',outputs['image'].shape) - image = (outputs['image'] * 255).astype(np.uint8) - if not self.opt.fullbody: - if self.opt.transport=='rtmp': - self.streamer.stream_frame(image) - else: - new_frame = VideoFrame.from_ndarray(image, format="rgb24") - asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) - else: #fullbody human - #print("frame index:",data['index']) - image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg')) - image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB) - start_x = self.opt.fullbody_offset_x # 合并后小图片的起始x坐标 - start_y = self.opt.fullbody_offset_y # 合并后小图片的起始y坐标 - image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image - if self.opt.transport=='rtmp': - self.streamer.stream_frame(image_fullbody) - else: - new_frame = VideoFrame.from_ndarray(image_fullbody, format="rgb24") - asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) + else: #fullbody human + #print("frame index:",data['index']) + image_fullbody = cv2.imread(os.path.join(self.opt.fullbody_img, str(data['index'][0])+'.jpg')) + image_fullbody = cv2.cvtColor(image_fullbody, cv2.COLOR_BGR2RGB) + start_x = self.opt.fullbody_offset_x # 合并后小图片的起始x坐标 + start_y = self.opt.fullbody_offset_y # 合并后小图片的起始y坐标 + image_fullbody[start_y:start_y+image.shape[0], start_x:start_x+image.shape[1]] = image + if self.opt.transport=='rtmp': + self.streamer.stream_frame(image_fullbody) + else: + new_frame = VideoFrame.from_ndarray(image_fullbody, format="rgb24") + asyncio.run_coroutine_threadsafe(video_track._queue.put(new_frame), loop) #self.pipe.stdin.write(image.tostring()) - else: - if self.audio_features is not None: - auds = get_audio_features(self.audio_features, self.opt.att, self.audio_idx) - else: - auds = None - outputs = self.trainer.test_gui(self.cam.pose, self.cam.intrinsics, self.W, self.H, auds, self.eye_area, self.ind_index, self.bg_color, self.spp, self.downscale) - + #ender.record() #torch.cuda.synchronize() #t = starter.elapsed_time(ender) diff --git 
a/webrtc.py b/webrtc.py index 9fc94d1..ca40f73 100644 --- a/webrtc.py +++ b/webrtc.py @@ -60,6 +60,7 @@ class PlayerStreamTrack(MediaStreamTrack): else: self._start = time.time() self._timestamp = 0 + print('video start:',self._start) return self._timestamp, VIDEO_TIME_BASE else: #audio if hasattr(self, "_timestamp"): @@ -71,6 +72,7 @@ class PlayerStreamTrack(MediaStreamTrack): else: self._start = time.time() self._timestamp = 0 + print('audio start:',self._start) return self._timestamp, AUDIO_TIME_BASE async def recv(self) -> Union[Frame, Packet]:
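
Note on the aiortc audio path in nerfreal.py above: each float32 chunk pulled from self.asr.get_audio_out() is scaled back to 16-bit PCM, wrapped in an av.AudioFrame (s16, mono, 16 kHz), and handed to the audio track's queue from the render thread through asyncio.run_coroutine_threadsafe. A minimal standalone sketch of that conversion, assuming a mono float32 chunk in [-1, 1] (chunk_to_audio_frame is a hypothetical helper name, not part of the patch):

    import numpy as np
    from av import AudioFrame

    def chunk_to_audio_frame(chunk: np.ndarray, sample_rate: int = 16000) -> AudioFrame:
        # Scale float32 PCM in [-1, 1] back to int16, as the diff does with (frame * 32767).
        pcm16 = (chunk * 32767).astype(np.int16)
        frame = AudioFrame(format='s16', layout='mono', samples=pcm16.shape[0])
        frame.planes[0].update(pcm16.tobytes())
        frame.sample_rate = sample_rate
        return frame

Timestamps are not set at this point; the pts bookkeeping for both tracks lives in PlayerStreamTrack in webrtc.py, which is where the two new start-time prints were added.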
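
The custom-video branch added to test_step() picks frames through self.mirror_index, which walks the image sequence forward on even passes and backward on odd passes so the clip loops without a visible jump. A standalone sketch of the same ping-pong indexing (size stands in for opt.customvideo_imgnum):

    def mirror_index(index: int, size: int) -> int:
        turn = index // size        # number of full passes over the sequence
        res = index % size          # position inside the current pass
        if turn % 2 == 0:
            return res              # even pass: play forward
        return size - res - 1       # odd pass: play backward

    # For size=4, indices 0..9 map to 0 1 2 3 3 2 1 0 0 1.
    print([mirror_index(i, 4) for i in range(10)])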
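
The fullbody branch composites the rendered head crop into the larger body image with a plain NumPy slice assignment; the two offsets (commented in Chinese in the diff as the start x and y of the small image inside the merged frame) come from opt.fullbody_offset_x / opt.fullbody_offset_y, and the crop is assumed to fit entirely inside the body frame since the diff does not clip at the borders. A small sketch of that paste with hypothetical names:

    import numpy as np

    def paste_crop(body: np.ndarray, crop: np.ndarray, offset_x: int, offset_y: int) -> np.ndarray:
        # Overwrite the (h, w) region of the (H, W, 3) body frame with the rendered crop.
        h, w = crop.shape[:2]
        body[offset_y:offset_y + h, offset_x:offset_x + w] = crop
        return body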