Adding support for audio data recording and broadcasting for LeKiwi

2025-04-09 17:53:08 +02:00 · 2025-04-09 17:53:08 +02:00 · ec8943db37
parent 1e5e631743
commit ec8943db37
8 changed files with 123 additions and 20 deletions
--- a/lerobot/common/datasets/compute_stats.py
+++ b/lerobot/common/datasets/compute_stats.py
@ -15,7 +15,7 @@
 # limitations under the License.
 import numpy as np

-from lerobot.common.datasets.utils import load_image_as_numpy, load_audio
+from lerobot.common.datasets.utils import load_image_as_numpy, load_audio_from_path

 def estimate_num_samples(
    dataset_len: int, min_num_samples: int = 100, max_num_samples: int = 10_000, power: float = 0.75
@ -70,13 +70,17 @@ def sample_images(image_paths: list[str]) -> np.ndarray:

    return images

-def sample_audio(audio_path: str) -> np.ndarray:
+def sample_audio_from_path(audio_path: str) -> np.ndarray:
+    """Load the audio file at ``audio_path`` and return a subsample of it.
+
+    The whole file is loaded with ``load_audio_from_path``; the entries kept along the
+    first axis are the ones picked by ``sample_indices(len(data))``.
+    """

-    data = load_audio(audio_path)
+    data = load_audio_from_path(audio_path)
    sampled_indices = sample_indices(len(data))

    return(data[sampled_indices])

+def sample_audio_from_data(data: np.ndarray) -> np.ndarray:
+    """Return a subsample of audio that is already held in memory.
+
+    In-memory counterpart of ``sample_audio_from_path``: the entries kept along the
+    first axis are the ones picked by ``sample_indices(len(data))``.
+    """
+    return data[sample_indices(len(data))]
+
 def get_feature_stats(array: np.ndarray, axis: tuple, keepdims: bool) -> dict[str, np.ndarray]:
    return {
        "min": np.min(array, axis=axis, keepdims=keepdims),
@ -97,7 +101,10 @@ def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], featu
            axes_to_reduce = (0, 2, 3)  # keep channel dim
            keepdims = True
        elif features[key]["dtype"] == "audio":
-            ep_ft_array = sample_audio(data[0])
+            # Audio reaches this point either as a list of file paths or as an array of
+            # raw samples (frame-by-frame recording, e.g. LeKiwi). Dispatch on the type
+            # instead of catching TypeError, which would also hide a genuine error
+            # raised while loading a file.
+            if isinstance(data, np.ndarray):
+                ep_ft_array = sample_audio_from_data(data)
+            else:
+                ep_ft_array = sample_audio_from_path(data[0])
            axes_to_reduce = 0 
            keepdims = True
        else:
--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
@ -80,6 +80,7 @@ from lerobot.common.datasets.video_utils import (
 )
 from lerobot.common.robot_devices.robots.utils import Robot
 from lerobot.common.robot_devices.microphones.utils import Microphone
+import soundfile as sf

 CODEBASE_VERSION = "v2.1"

@ -324,7 +325,7 @@ class LeRobotDatasetMetadata:
        been encoded the same way. Also, this means it assumes the first episode exists.
        """
        for key in set(self.audio_keys) - set(self.audio_camera_keys_mapping.keys()):
-            if not self.features[key].get("info", None):
+            if not self.features[key].get("info", None) or (len(self.features[key]["info"]) == 1 and "sample_rate" in self.features[key]["info"]):
                audio_path = self.root / self.get_compressed_audio_file_path(0, key)
                self.info["features"][key]["info"] = get_audio_info(audio_path)

@ -910,6 +911,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
                self._save_image(frame[key], img_path)
                self.episode_buffer[key].append(str(img_path))
            elif self.features[key]["dtype"] == "audio":
+                if self.meta.robot_type.startswith("lekiwi"):
+                    self.episode_buffer[key].append(frame[key])
+                else:
                    if frame_index == 0:
                        audio_path = self._get_raw_audio_file_path(
                            episode_index=self.episode_buffer["episode_index"], audio_key=key
@ -966,12 +970,23 @@ class LeRobotDataset(torch.utils.data.Dataset):
        for key, ft in self.features.items():
            # index, episode_index, task_index are already processed above, and image and video
            # are processed separately by storing image path and frame info as meta data
-            if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video", "audio"]:
+            if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video"]:
+                continue
+            elif ft["dtype"] == "audio":
+                if self.meta.robot_type.startswith("lekiwi"):
+                    episode_buffer[key] = np.concatenate(episode_buffer[key], axis=0)
                continue
            episode_buffer[key] = np.stack(episode_buffer[key])

        self._wait_image_writer()
        self._save_episode_table(episode_buffer, episode_index)
+
+        # LeKiwi audio is buffered frame by frame, so the raw audio file is written here,
+        # once the episode's chunks have been concatenated.
+        # NOTE(review): `or ""` guards against robot_type being None (presumably possible
+        # for datasets created without a robot) - this check runs on every save.
+        if (self.meta.robot_type or "").startswith("lekiwi"):
+            for key in self.meta.audio_keys:
+                # Use the local episode_index (already used for the episode table above)
+                # rather than indexing self.episode_buffer, which need not be the buffer
+                # being saved.
+                audio_path = self._get_raw_audio_file_path(episode_index=episode_index, audio_key=key)
+                audio_feature = self.meta.features[key]
+                with sf.SoundFile(
+                    audio_path,
+                    mode="w",
+                    samplerate=audio_feature["info"]["sample_rate"],
+                    channels=audio_feature["shape"][0],
+                ) as file:
+                    file.write(episode_buffer[key])
+
        ep_stats = compute_episode_stats(episode_buffer, self.features)

        if len(self.meta.video_keys) > 0:
--- a/lerobot/common/datasets/utils.py
+++ b/lerobot/common/datasets/utils.py
@ -260,7 +260,7 @@ def load_image_as_numpy(
        img_array /= 255.0
    return img_array

-def load_audio(fpath: str | Path) -> np.ndarray:
+def load_audio_from_path(fpath: str | Path) -> np.ndarray:
+    """Read the audio file at ``fpath`` and return its data, requested as float32.
+
+    ``read`` returns two values; only the first (the audio data) is kept.
+    """
    audio_data, _ = read(fpath, dtype="float32")
    return audio_data

--- a/lerobot/common/robot_devices/control_utils.py
+++ b/lerobot/common/robot_devices/control_utils.py
@ -252,7 +252,7 @@ def control_loop(
    timestamp = 0
    start_episode_t = time.perf_counter()

-    if dataset is not None:
+    if dataset is not None and not robot.robot_type.startswith("lekiwi"):   #For now, LeKiwi only supports frame audio recording (which may lead to audio chunks loss, extended post-processing, increased memory usage)
        for microphone_key, microphone in robot.microphones.items():
            #Start recording both in file writing and data reading mode
            dataset.add_microphone_recording(microphone, microphone_key)
--- a/lerobot/common/robot_devices/robots/lekiwi_remote.py
+++ b/lerobot/common/robot_devices/robots/lekiwi_remote.py
@ -51,6 +51,14 @@ def run_camera_capture(cameras, images_lock, latest_images_dict, stop_event):
            latest_images_dict.update(local_dict)
        time.sleep(0.01)

+def run_microphone_capture(microphones, audio_lock, latest_audio_dict, stop_event):
+    """Keep ``latest_audio_dict`` filled with the latest reading of every microphone.
+
+    Meant to run in a background thread. Until ``stop_event`` is set, each pass calls
+    ``read()`` on every microphone, then stores the results in ``latest_audio_dict``
+    while holding ``audio_lock``. A new reading replaces the previous one stored for the
+    same microphone, so a reading nobody copied in between is dropped.
+
+    Args:
+        microphones: Mapping of microphone name to an object exposing ``read()``.
+        audio_lock: Lock guarding ``latest_audio_dict``.
+        latest_audio_dict: Shared dict, updated in place with ``{name: reading}``.
+        stop_event: Event that ends the loop once set.
+    """
+    # Without microphones the loop below would spin at full speed doing nothing but
+    # taking the lock, so there is nothing to run.
+    if not microphones:
+        return
+    while not stop_event.is_set():
+        local_dict = {name: microphone.read() for name, microphone in microphones.items()}
+        with audio_lock:
+            latest_audio_dict.update(local_dict)

 def calibrate_follower_arm(motors_bus, calib_dir_str):
    """
@ -94,6 +102,7 @@ def run_lekiwi(robot_config):
    """
    # Import helper functions and classes
    from lerobot.common.robot_devices.cameras.utils import make_cameras_from_configs
+    from lerobot.common.robot_devices.microphones.utils import make_microphones_from_configs
    from lerobot.common.robot_devices.motors.feetech import FeetechMotorsBus, TorqueMode

    # Initialize cameras from the robot configuration.
@ -101,6 +110,11 @@ def run_lekiwi(robot_config):
    for cam in cameras.values():
        cam.connect()

+    # Initialize microphones from the robot configuration.
+    microphones = make_microphones_from_configs(robot_config.microphones)
+    for microphone in microphones.values():
+        microphone.connect()
+
    # Initialize the motors bus using the follower arm configuration.
    motor_config = robot_config.follower_arms.get("main")
    if motor_config is None:
@ -134,6 +148,18 @@ def run_lekiwi(robot_config):
    )
    cam_thread.start()

+    # Start the microphone recording and capture thread.
+    #TODO(CarolinePascal) : Leverage multi-core processing with a multiprocessing.Process instead !
+    latest_audio_dict = {}
+    audio_lock = threading.Lock()
+    audio_stop_event = threading.Event()
+    microphone_thread = threading.Thread(
+        target=run_microphone_capture, args=(microphones, audio_lock, latest_audio_dict, audio_stop_event), daemon=True
+    )
+    for microphone in microphones.values():
+        microphone.start_recording()
+    microphone_thread.start()
+
    last_cmd_time = time.time()
    print("LeKiwi robot server started. Waiting for commands...")

@ -198,9 +224,14 @@ def run_lekiwi(robot_config):
            with images_lock:
                images_dict_copy = dict(latest_images_dict)

+            # Get the latest audio data.
+            with audio_lock:
+                audio_dict_copy = dict(latest_audio_dict)
+
            # Build the observation dictionary.
            observation = {
                "images": images_dict_copy,
+                "audio": audio_dict_copy,   #TODO(CarolinePascal) : This is a nasty way to do it, sorry.
                "present_speed": current_velocity,
                "follower_arm_state": follower_arm_state,
            }
@ -217,6 +248,9 @@ def run_lekiwi(robot_config):
    finally:
        stop_event.set()
        cam_thread.join()
+        # The microphone thread polls its own event (audio_stop_event), not stop_event:
+        # it has to be set here, otherwise the join() below never returns.
+        audio_stop_event.set()
+        microphone_thread.join()
+        for microphone in microphones.values():
+            microphone.stop_recording()
        robot.stop()
        motors_bus.disconnect()
        cmd_socket.close()
--- a/lerobot/common/robot_devices/robots/manipulator.py
+++ b/lerobot/common/robot_devices/robots/manipulator.py
@ -211,7 +211,7 @@ class ManipulatorRobot:
                "dtype": "audio",
                "shape": (len(mic.channels),),
                "names": "channels",
-                "info" : None,
+                "info" : {"sample_rate": mic.sample_rate},
            }
        return mic_ft

--- a/lerobot/common/robot_devices/robots/mobile_manipulator.py
+++ b/lerobot/common/robot_devices/robots/mobile_manipulator.py
@ -24,6 +24,7 @@ import torch
 import zmq

 from lerobot.common.robot_devices.cameras.utils import make_cameras_from_configs
+from lerobot.common.robot_devices.microphones.utils import make_microphones_from_configs
 from lerobot.common.robot_devices.motors.feetech import TorqueMode
 from lerobot.common.robot_devices.motors.utils import MotorsBus, make_motors_buses_from_configs
 from lerobot.common.robot_devices.robots.configs import LeKiwiRobotConfig
@ -79,6 +80,7 @@ class MobileManipulator:
        self.follower_arms = make_motors_buses_from_configs(self.config.follower_arms)

        self.cameras = make_cameras_from_configs(self.config.cameras)
+        self.microphones = make_microphones_from_configs(self.config.microphones)

        self.is_connected = False

@ -133,6 +135,7 @@ class MobileManipulator:
                "shape": (cam.height, cam.width, cam.channels),
                "names": ["height", "width", "channels"],
                "info": None,
+                "audio": "observation.audio." + cam.microphone if cam.microphone is not None else None,
            }
        return cam_ft

@ -161,9 +164,22 @@ class MobileManipulator:
            },
        }
    
+    @property
+    def microphone_features(self) -> dict:
+        """Describe every configured microphone as an audio feature.
+
+        Returns:
+            A dict keyed by ``observation.audio.<microphone key>``. Each value holds the
+            dtype (``"audio"``), a one-element shape (the number of channels), the axis
+            name and, under ``info``, the microphone's sample rate.
+        """
+        return {
+            f"observation.audio.{mic_key}": {
+                "dtype": "audio",
+                "shape": (len(mic.channels),),
+                "names": "channels",
+                "info": {"sample_rate": mic.sample_rate},
+            }
+            for mic_key, mic in self.microphones.items()
+        }
+
    @property
    def features(self):
-        return {**self.motor_features, **self.camera_features}
+        return {**self.motor_features, **self.camera_features, **self.microphone_features}

    @property
    def has_camera(self):
@ -173,6 +189,14 @@ class MobileManipulator:
    def num_cameras(self):
        return len(self.cameras)
    
+    @property
+    def has_microphone(self):
+        """Whether at least one microphone is configured."""
+        return self.num_microphones > 0
+    
+    @property
+    def num_microphones(self):
+        """Number of configured microphones."""
+        return len(self.microphones)
+
    @property
    def available_arms(self):
        available = []
@ -344,6 +368,7 @@ class MobileManipulator:
            observation = json.loads(last_msg)

            images_dict = observation.get("images", {})
+            audio_dict = observation.get("audio", {})
            new_speed = observation.get("present_speed", {})
            new_arm_state = observation.get("follower_arm_state", None)

@ -356,6 +381,11 @@ class MobileManipulator:
                    if frame_candidate is not None:
                        frames[cam_name] = frame_candidate

+            # Receive audio. The message was decoded with json.loads, so audio arrives as
+            # (nested) lists: turn it back into a float32 array, because the frames are
+            # later handed to torch.from_numpy(), which only accepts numpy arrays.
+            for microphone_name, audio_data in audio_dict.items():
+                if audio_data:
+                    frames[microphone_name] = np.asarray(audio_data, dtype=np.float32)
+
            # If remote_arm_state is None and frames is None there is no message then use the previous message
            if new_arm_state is not None and frames is not None:
                self.last_frames = frames
@ -475,6 +505,14 @@ class MobileManipulator:
                frame = np.zeros((cam.height, cam.width, cam.channels), dtype=np.uint8)
            obs_dict[f"observation.images.{cam_name}"] = torch.from_numpy(frame)

+        # Loop over each configured microphone
+        for microphone_name, microphone in self.microphones.items():
+            frame = frames.get(microphone_name, None)
+            if frame is None:
+                # Create silence using the microphone's configured channels
+                frame = np.zeros((1, len(microphone.channels)), dtype=np.float32)
+            obs_dict[f"observation.audio.{microphone_name}"] = torch.from_numpy(frame)
+            
        return obs_dict

    def send_action(self, action: torch.Tensor) -> torch.Tensor:
--- a/tests/datasets/test_compute_stats.py
+++ b/tests/datasets/test_compute_stats.py
@ -26,7 +26,8 @@ from lerobot.common.datasets.compute_stats import (
    estimate_num_samples,
    get_feature_stats,
    sample_images,
-    sample_audio,
+    sample_audio_from_path,
+    sample_audio_from_data,
    sample_indices,
 )

@ -73,10 +74,18 @@ def test_sample_images(mock_load):
    assert images.dtype == np.uint8
    assert len(images) == estimate_num_samples(100)

-@patch("lerobot.common.datasets.compute_stats.load_audio", side_effect=mock_load_audio)
-def test_sample_audio(mock_load):
+@patch("lerobot.common.datasets.compute_stats.load_audio_from_path", side_effect=mock_load_audio)
+def test_sample_audio_from_path(mock_load):
+    # File loading is patched out with mock_load_audio. The assertions below expect
+    # two-channel float32 output whose length equals estimate_num_samples(16000).
    audio_path = "audio.wav"
-    audio_samples = sample_audio(audio_path)
+    audio_samples = sample_audio_from_path(audio_path)
+    assert isinstance(audio_samples, np.ndarray)
+    assert audio_samples.shape[1] == 2
+    assert audio_samples.dtype == np.float32
+    assert len(audio_samples) == estimate_num_samples(16000)
+
+def test_sample_audio_from_data():
+    # Works on an in-memory array, so no loader is patched and the test takes no argument.
+    audio_data = np.ones((16000, 2), dtype=np.float32)
+    audio_samples = sample_audio_from_data(audio_data)
    assert isinstance(audio_samples, np.ndarray)
    assert audio_samples.shape[1] == 2
    assert audio_samples.dtype == np.float32
@ -166,7 +175,7 @@ def test_compute_episode_stats():
    with patch(
        "lerobot.common.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy
    ), patch(
-        "lerobot.common.datasets.compute_stats.load_audio", side_effect=mock_load_audio
+        "lerobot.common.datasets.compute_stats.load_audio_from_path", side_effect=mock_load_audio
    ):
        stats = compute_episode_stats(episode_data, features)