Adding audio frames reading capability

2025-04-03 11:41:05 +02:00 · 2025-04-03 11:41:05 +02:00 · 058478a74d
parent e4eebd0680
commit 058478a74d
3 changed files with 111 additions and 37 deletions
--- a/lerobot/common/robot_devices/control_utils.py
+++ b/lerobot/common/robot_devices/control_utils.py
@ -249,9 +249,14 @@ def control_loop(
    timestamp = 0
    start_episode_t = time.perf_counter()
-    if teleoperate and dataset is not None:
+    if dataset is not None:
        for microphone_key, microphone in robot.microphones.items():
            #Start recording both in file writing and data reading mode
            dataset.add_microphone_recording(microphone, microphone_key)
    else:
        for _, microphone in robot.microphones.items():
            # Start recording only in data reading mode
            microphone.start_recording()
    while timestamp < control_time_s:
        start_loop_t = time.perf_counter()
@ -271,7 +276,9 @@ def control_loop(
                action = {"action": action}
        if dataset is not None:
-            frame = {**observation, **action, "task": single_task}
+            #Remove audio frames which are directly written in a dedicated file
            audioless_observation = {key: observation[key] for key in observation if key not in robot.microphones}
            frame = {**audioless_observation, **action, "task": single_task}
            dataset.add_frame(frame)
        # TODO(Steven): This should be more general (for RemoteRobot instead of checking the name, but anyways it will change soon)
--- a/lerobot/common/robot_devices/microphones/microphone.py
+++ b/lerobot/common/robot_devices/microphones/microphone.py
@ -119,16 +119,13 @@ class Microphone:
    config = MicrophoneConfig(microphone_index=0, sampling_rate=16000, channels=[1], data_type="int16")
    microphone = Microphone(config)
    microphone.connect()
    microphone.start_recording("some/output/file.wav")
    ...
-    microphone.stop_recording()
+    audio_readings = microphone.read()  #Gets all recorded audio data since the last read or since the beginning of the recording
    #OR
    microphone.start_recording()
    ...
    microphone.stop_recording()
-    last_recorded_audio_chunk = microphone.queue.get()
+    microphone.disconnect()
    ```
    """
@ -145,12 +142,16 @@ class Microphone:
        #Input audio stream
        self.stream = None
        #Thread-safe concurrent queue to store the recorded audio
        self.queue = Queue()
        self.thread = None
        self.stop_event = None
        self.logs = {}
        #Thread-safe concurrent queue to store the recorded/read audio
        self.record_queue = Queue()
        self.read_queue = Queue()
        #Thread to handle data reading and file writing in a separate thread (safely)
        self.record_thread = None
        self.record_stop_event = None
        self.logs = {}
        self.is_connected = False
    def connect(self) -> None:
@ -213,53 +214,101 @@ class Microphone:
    def _audio_callback(self, indata, frames, time, status) -> None :
        if status:
            logging.warning(status)
-        #slicing makes copy unecessary 
+        # Slicing makes copy unecessary 
-        self.queue.put(indata[:,self.channels])
+        # Two separate queues are necessary because .get() also pops the data from the queue
        self.record_queue.put(indata[:,self.channels])
        self.read_queue.put(indata[:,self.channels])
-    def _read_write_loop(self, output_file : Path) -> None:
+    def _record_loop(self, output_file: Path) -> None:
        output_file = Path(output_file)
        if output_file.exists():
            shutil.rmtree(
                output_file,
            )
        with sf.SoundFile(output_file, mode='x', samplerate=self.sampling_rate,
                      channels=max(self.channels)+1, subtype=sf.default_subtype(output_file.suffix[1:])) as file:
-            while not self.stop_event.is_set():
+            while not self.record_stop_event.is_set():
-                file.write(self.queue.get())
+                file.write(self.record_queue.get())
                #self.record_queue.task_done()
    def _read(self) -> np.ndarray:
        """
        Gets audio data from the queue and coverts it to a numpy array.
        -> PROS : Inherently thread safe, no need to lock the queue, lightweight CPU usage
        -> CONS : Reading duration does not scale well with the number of channels and reading duration
        """
        try:
            audio_readings = self.read_queue.queue
        except Queue.Empty:
            audio_readings = np.empty((0, len(self.channels)))
        else:
            #TODO(CarolinePascal): Check if this is the fastest way to do it
            self.read_queue = Queue()
            with self.read_queue.mutex:
                self.read_queue.queue.clear()
                #self.read_queue.all_tasks_done.notify_all()
            audio_readings = np.array(audio_readings).reshape(-1, len(self.channels))
        return audio_readings
    def read(self) -> np.ndarray:
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
        if not self.stream.active:
            raise RuntimeError(f"Microphone {self.microphone_index} is not recording.")
        start_time = time.perf_counter()
        audio_readings = self._read()
        # log the number of seconds it took to read the audio chunk
        self.logs["delta_timestamp_s"] = time.perf_counter() - start_time
        # log the utc time at which the audio chunk was received
        self.logs["timestamp_utc"] = capture_timestamp_utc()
        return audio_readings
    def start_recording(self, output_file : str | None = None) -> None: 
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
        self.read_queue = Queue()
        with self.read_queue.mutex:
            self.read_queue.queue.clear()
            #self.read_queue.all_tasks_done.notify_all()
        self.record_queue = Queue()
        with self.record_queue.mutex:
            self.record_queue.queue.clear()
            #self.record_queue.all_tasks_done.notify_all()
        #Recording case
        if output_file is not None:
-            self.stop_event = Event()
+            output_file = Path(output_file)
-            self.thread = Thread(target=self._read_write_loop, args=(output_file,))
+            if output_file.exists():
-            self.thread.daemon = True
+                output_file.unlink()
-            self.thread.start()
+
            self.record_stop_event = Event()
            self.record_thread = Thread(target=self._record_loop, args=(output_file,))
            self.record_thread.daemon = True
            self.record_thread.start()
        self.logs["start_timestamp"] = capture_timestamp_utc()
        self.stream.start()
    def stop_recording(self) -> None:
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
        self.logs["stop_timestamp"] = capture_timestamp_utc()
-        if self.thread is not None:
+        if self.record_thread is not None:
-            self.stop_event.set()
+            #self.record_queue.join()
-            self.thread.join()
+            self.record_stop_event.set()
-            self.thread = None
+            self.record_thread.join()
-            self.stop_event = None
+            self.record_thread = None
            self.record_stop_event = None
        if self.stream.active:
            self.stream.stop()  #Wait for all buffers to be processed
            #Remark : stream.abort() flushes the buffers !
        self.logs["duration"] = self.logs["stop_timestamp"] - self.logs["start_timestamp"]
    def disconnect(self) -> None:
        if not self.is_connected:
--- a/lerobot/common/robot_devices/robots/manipulator.py
+++ b/lerobot/common/robot_devices/robots/manipulator.py
@ -541,6 +541,15 @@ class ManipulatorRobot:
            self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t
        # Capture audio from microphones
        audio = {}
        for name in self.microphones:
            before_audioread_t = time.perf_counter()
            audio[name] = self.microphones[name].read()
            audio[name] = torch.from_numpy(audio[name])
            self.logs[f"read_microphone_{name}_dt_s"] = self.microphones[name].logs["delta_timestamp_s"]
            self.logs[f"async_read_microphone_{name}_dt_s"] = time.perf_counter() - before_audioread_t
        # Populate output dictionaries
        obs_dict, action_dict = {}, {}
        obs_dict["observation.state"] = state
@ -581,6 +590,15 @@ class ManipulatorRobot:
            self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t
        # Capture audio from microphones
        audio = {}
        for name in self.microphones:
            before_audioread_t = time.perf_counter()
            audio[name] = self.microphones[name].read()
            audio[name] = torch.from_numpy(audio[name])
            self.logs[f"read_microphone_{name}_dt_s"] = self.microphones[name].logs["delta_timestamp_s"]
            self.logs[f"async_read_microphone_{name}_dt_s"] = time.perf_counter() - before_audioread_t
        # Populate output dictionaries and format to pytorch
        obs_dict = {}
        obs_dict["observation.state"] = state