Caroline Pascal 2025-04-15 15:41:07 +00:00 committed by GitHub
commit 35d58527a4
25 changed files with 1610 additions and 94 deletions

View File

@@ -190,6 +190,11 @@ available_cameras = [
    "intelrealsense",
]
+# lists all available microphones from `lerobot/common/robot_devices/microphones`
+available_microphones = [
+    "microphone",
+]
# lists all available motors from `lerobot/common/robot_devices/motors`
available_motors = [
    "dynamixel",

View File

@@ -0,0 +1,165 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import subprocess
from collections import OrderedDict
from pathlib import Path
import torch
import torchaudio
from numpy import ceil
def decode_audio(
audio_path: Path | str,
timestamps: list[float],
duration: float,
backend: str | None = "ffmpeg",
) -> torch.Tensor:
"""
Decodes audio using the specified backend.
Args:
audio_path (Path): Path to the audio file.
timestamps (list[float]): List of (starting) timestamps to extract audio chunks.
duration (float): Duration of the audio chunks in seconds.
backend (str, optional): Backend to use for decoding. Defaults to "ffmpeg".
Returns:
torch.Tensor: Decoded audio chunks.
Currently supports ffmpeg.
"""
if backend == "torchcodec":
raise NotImplementedError("torchcodec is not yet supported for audio decoding")
elif backend == "ffmpeg":
return decode_audio_torchaudio(audio_path, timestamps, duration)
else:
        raise ValueError(f"Unsupported audio backend: {backend}")
def decode_audio_torchaudio(
audio_path: Path | str,
timestamps: list[float],
duration: float,
log_loaded_timestamps: bool = False,
) -> torch.Tensor:
# TODO(CarolinePascal) : add channels selection
audio_path = str(audio_path)
reader = torchaudio.io.StreamReader(src=audio_path)
audio_sample_rate = reader.get_src_stream_info(reader.default_audio_stream).sample_rate
# TODO(CarolinePascal) : sort timestamps ?
reader.add_basic_audio_stream(
frames_per_chunk=int(ceil(duration * audio_sample_rate)), # Too much is better than not enough
buffer_chunk_size=-1, # No dropping frames
format="fltp", # Format as float32
)
audio_chunks = []
for ts in timestamps:
reader.seek(ts) # Default to closest audio sample
status = reader.fill_buffer()
if status != 0:
logging.warning("Audio stream reached end of recording before decoding desired timestamps.")
current_audio_chunk = reader.pop_chunks()[0]
if log_loaded_timestamps:
logging.info(
f"audio chunk loaded at starting timestamp={current_audio_chunk['pts']:.4f} with duration={len(current_audio_chunk) / audio_sample_rate:.4f}"
)
audio_chunks.append(current_audio_chunk)
audio_chunks = torch.stack(audio_chunks)
assert len(timestamps) == len(audio_chunks)
return audio_chunks
def encode_audio(
input_path: Path | str,
output_path: Path | str,
    codec: str = "aac",  # TODO(CarolinePascal): investigate Fraunhofer FDK AAC (libfdk_aac) codec and constant (file size control) / variable (quality control) bitrate options
log_level: str | None = "error",
overwrite: bool = False,
) -> None:
"""Encodes an audio file using ffmpeg."""
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
ffmpeg_args = OrderedDict(
[
("-i", str(input_path)),
("-acodec", codec),
]
)
if log_level is not None:
ffmpeg_args["-loglevel"] = str(log_level)
ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair]
if overwrite:
ffmpeg_args.append("-y")
ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(output_path)]
# redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
if not output_path.exists():
raise OSError(
f"Audio encoding did not work. File not found: {output_path}. "
f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`"
)
def get_audio_info(video_path: Path | str) -> dict:
ffprobe_audio_cmd = [
"ffprobe",
"-v",
"error",
"-select_streams",
"a:0",
"-show_entries",
"stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration",
"-of",
"json",
str(video_path),
]
result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if result.returncode != 0:
raise RuntimeError(f"Error running ffprobe: {result.stderr}")
info = json.loads(result.stdout)
audio_stream_info = info["streams"][0] if info.get("streams") else None
if audio_stream_info is None:
return {"has_audio": False}
# Return the information, defaulting to None if no audio stream is present
return {
"has_audio": True,
"audio.channels": audio_stream_info.get("channels", None),
"audio.codec": audio_stream_info.get("codec_name", None),
"audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None,
"audio.sample_rate": int(audio_stream_info["sample_rate"])
if audio_stream_info.get("sample_rate")
else None,
"audio.bit_depth": audio_stream_info.get("bit_depth", None),
"audio.channel_layout": audio_stream_info.get("channel_layout", None),
}
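Taken together, a round trip through these helpers looks like the sketch below. This is an illustration, not part of the commit: the paths are made up, and decoding assumes a torchaudio build with ffmpeg support.

```python
from pathlib import Path

from lerobot.common.datasets.audio_utils import decode_audio, encode_audio, get_audio_info

# Hypothetical paths following the dataset layout introduced by this commit.
raw_path = Path("audio/observation.audio.laptop/episode_000000.wav")
compressed_path = Path("audio/chunk-000/observation.audio.laptop/episode_000000.m4a")

# Compress the raw recording to AAC, then inspect the resulting stream.
encode_audio(raw_path, compressed_path, codec="aac", overwrite=True)
print(get_audio_info(compressed_path))  # e.g. {"has_audio": True, "audio.codec": "aac", ...}

# Extract two 0.5 s chunks starting at t=1.0 s and t=2.5 s.
chunks = decode_audio(compressed_path, timestamps=[1.0, 2.5], duration=0.5, backend="ffmpeg")
print(chunks.shape)  # expected: (2, samples_per_chunk, channels)
```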

View File

@@ -15,7 +15,7 @@
# limitations under the License.
import numpy as np

-from lerobot.common.datasets.utils import load_image_as_numpy
+from lerobot.common.datasets.utils import load_audio_from_path, load_image_as_numpy

def estimate_num_samples(
@@ -72,6 +72,20 @@ def sample_images(image_paths: list[str]) -> np.ndarray:
    return images

+def sample_audio_from_path(audio_path: str) -> np.ndarray:
+    """Samples audio data from an audio recording stored in a WAV file."""
+    data = load_audio_from_path(audio_path)
+    sampled_indices = sample_indices(len(data))
+    return data[sampled_indices]
+
+def sample_audio_from_data(data: np.ndarray) -> np.ndarray:
+    """Samples audio data from an audio recording stored in a numpy array."""
+    sampled_indices = sample_indices(len(data))
+    return data[sampled_indices]
+
def get_feature_stats(array: np.ndarray, axis: tuple, keepdims: bool) -> dict[str, np.ndarray]:
    return {
        "min": np.min(array, axis=axis, keepdims=keepdims),
@@ -91,6 +105,13 @@ def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], featu
            ep_ft_array = sample_images(data)  # data is a list of image paths
            axes_to_reduce = (0, 2, 3)  # keep channel dim
            keepdims = True
+        elif features[key]["dtype"] == "audio":
+            try:
+                ep_ft_array = sample_audio_from_path(data[0])
+            except TypeError:
+                # Should only be triggered for the LeKiwi robot, for which audio is stored
+                # chunk by chunk in a visual frame-like manner
+                ep_ft_array = sample_audio_from_data(data)
+            axes_to_reduce = 0
+            keepdims = True
        else:
            ep_ft_array = data  # data is already a np.ndarray
            axes_to_reduce = 0  # compute stats over the first axis
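For an audio feature, the stats are reduced over axis 0 (the sample axis) with keepdims=True, so one value per channel survives. A minimal sketch of that reduction, with made-up data and a linspace stand-in for sample_indices:

```python
import numpy as np

# Hypothetical 1 s stereo recording at 48 kHz, float32 in [-1, 1].
data = np.random.uniform(-1.0, 1.0, size=(48_000, 2)).astype(np.float32)

# Stand-in for sample_indices: pick ~100 evenly spaced samples.
idx = np.linspace(0, len(data) - 1, num=100, dtype=int)
sampled = data[idx]

# Reducing over axis 0 with keepdims, as compute_episode_stats does for audio.
mean = np.mean(sampled, axis=0, keepdims=True)
print(mean.shape)  # (1, 2): one statistic per channel
```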

View File

@@ -23,6 +23,7 @@ import datasets
import numpy as np
import packaging.version
import PIL.Image
+import soundfile as sf
import torch
import torch.utils
from datasets import concatenate_datasets, load_dataset
@@ -31,11 +32,18 @@ from huggingface_hub.constants import REPOCARD_NAME
from huggingface_hub.errors import RevisionNotFoundError

from lerobot.common.constants import HF_LEROBOT_HOME
+from lerobot.common.datasets.audio_utils import (
+    decode_audio,
+    encode_audio,
+    get_audio_info,
+)
from lerobot.common.datasets.compute_stats import aggregate_stats, compute_episode_stats
from lerobot.common.datasets.image_writer import AsyncImageWriter, write_image
from lerobot.common.datasets.utils import (
+    DEFAULT_AUDIO_CHUNK_DURATION,
    DEFAULT_FEATURES,
    DEFAULT_IMAGE_PATH,
+    DEFAULT_RAW_AUDIO_PATH,
    INFO_PATH,
    TASKS_PATH,
    append_jsonlines,
@@ -72,6 +80,7 @@ from lerobot.common.datasets.video_utils import (
    get_safe_default_codec,
    get_video_info,
)
+from lerobot.common.robot_devices.microphones.utils import Microphone
from lerobot.common.robot_devices.robots.utils import Robot

CODEBASE_VERSION = "v2.1"
@@ -142,6 +151,14 @@ class LeRobotDatasetMetadata:
        fpath = self.video_path.format(episode_chunk=ep_chunk, video_key=vid_key, episode_index=ep_index)
        return Path(fpath)

+    def get_compressed_audio_file_path(self, episode_index: int, audio_key: str) -> Path:
+        """Returns the path of the compressed (i.e. encoded) audio file."""
+        episode_chunk = self.get_episode_chunk(episode_index)
+        fpath = self.audio_path.format(
+            episode_chunk=episode_chunk, audio_key=audio_key, episode_index=episode_index
+        )
+        return self.root / fpath
+
    def get_episode_chunk(self, ep_index: int) -> int:
        return ep_index // self.chunks_size
@@ -155,6 +172,11 @@ class LeRobotDatasetMetadata:
        """Formattable string for the video files."""
        return self.info["video_path"]

+    @property
+    def audio_path(self) -> str | None:
+        """Formattable string for the audio files."""
+        return self.info["audio_path"]
+
    @property
    def robot_type(self) -> str | None:
        """Robot type used in recording this dataset."""
@@ -185,6 +207,11 @@ class LeRobotDatasetMetadata:
        """Keys to access visual modalities (regardless of their storage method)."""
        return [key for key, ft in self.features.items() if ft["dtype"] in ["video", "image"]]

+    @property
+    def audio_keys(self) -> list[str]:
+        """Keys to access audio modalities."""
+        return [key for key, ft in self.features.items() if ft["dtype"] == "audio"]
+
    @property
    def names(self) -> dict[str, list | dict]:
        """Names of the various dimensions of vector modalities."""
@@ -264,6 +291,10 @@ class LeRobotDatasetMetadata:
        if len(self.video_keys) > 0:
            self.update_video_info()

+        self.info["total_audio"] += len(self.audio_keys)
+        if len(self.audio_keys) > 0:
+            self.update_audio_info()
+
        write_info(self.info, self.root)

        episode_dict = {
@@ -288,6 +319,19 @@ class LeRobotDatasetMetadata:
            video_path = self.root / self.get_video_file_path(ep_index=0, vid_key=key)
            self.info["features"][key]["info"] = get_video_info(video_path)

+    def update_audio_info(self) -> None:
+        """
+        Warning: this function writes info from the first episode's audio, implicitly assuming that
+        all audio files have been encoded the same way. It also assumes that the first episode exists.
+        """
+        for key in self.audio_keys:
+            if not self.features[key].get("info", None) or (
+                len(self.features[key]["info"]) == 1 and "sample_rate" in self.features[key]["info"]
+            ):
+                # Overwrite if info is empty or only contains the sample rate
+                # (necessary to correctly save audio files recorded by LeKiwi)
+                audio_path = self.root / self.get_compressed_audio_file_path(0, key)
+                self.info["features"][key]["info"] = get_audio_info(audio_path)
+
    def __repr__(self):
        feature_keys = list(self.features)
        return (
@@ -363,7 +407,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
        revision: str | None = None,
        force_cache_sync: bool = False,
        download_videos: bool = True,
+        download_audio: bool = True,
        video_backend: str | None = None,
+        audio_backend: str | None = None,
    ):
        """
        2 modes are available for instantiating this class, depending on 2 different use cases:
@@ -394,7 +440,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
            - tasks contains the prompts for each task of the dataset, which can be used for
              task-conditioned training.
            - hf_dataset (from datasets.Dataset), which will read any values from parquet files.
-            - videos (optional) from which frames are loaded to be synchronous with data from parquet files.
+            - videos (optional) from which frames and audio (if any) are loaded to be synchronous with data from parquet files and audio.
+            - audio (optional) from which audio is loaded to be synchronous with data from parquet files and videos.

        A typical LeRobotDataset looks like this from its root path:
        .
@@ -415,17 +462,31 @@
        info.json
        stats.json
        tasks.jsonl
    videos
        chunk-000
            observation.images.laptop
                episode_000000.mp4
                episode_000001.mp4
                episode_000002.mp4
                ...
            observation.images.phone
                episode_000000.mp4
                episode_000001.mp4
                episode_000002.mp4
                ...
        chunk-001
        ...
+   audio
+       chunk-000
+           observation.audio.laptop
+               episode_000000.m4a
+               episode_000001.m4a
+               episode_000002.m4a
+               ...
+           observation.audio.phone
+               episode_000000.m4a
+               episode_000001.m4a
+               episode_000002.m4a
+               ...
+       chunk-001
+       ...
@@ -463,8 +524,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
            download_videos (bool, optional): Flag to download the videos. Note that when set to True but the
                video files are already present on local disk, they won't be downloaded again. Defaults to
                True.
+            download_audio (bool, optional): Flag to download the audio (see download_videos). Defaults to True.
            video_backend (str | None, optional): Video backend to use for decoding videos. Defaults to torchcodec when available on the platform; otherwise, defaults to 'pyav'.
                You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
+            audio_backend (str | None, optional): Audio backend to use for decoding audio. Defaults to the 'ffmpeg' decoder used by 'torchaudio'.
        """
        super().__init__()
        self.repo_id = repo_id
@@ -475,6 +538,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
        self.tolerance_s = tolerance_s
        self.revision = revision if revision else CODEBASE_VERSION
        self.video_backend = video_backend if video_backend else get_safe_default_codec()
+        self.audio_backend = (
+            audio_backend if audio_backend else "ffmpeg"
+        )  # Waiting for torchcodec release  # TODO(CarolinePascal)
        self.delta_indices = None

        # Unused attributes
@@ -499,7 +565,7 @@
            self.hf_dataset = self.load_hf_dataset()
        except (AssertionError, FileNotFoundError, NotADirectoryError):
            self.revision = get_safe_version(self.repo_id, self.revision)
-            self.download_episodes(download_videos)
+            self.download_episodes(download_videos, download_audio)
            self.hf_dataset = self.load_hf_dataset()

        self.episode_data_index = get_episode_data_index(self.meta.episodes, self.episodes)
@@ -510,6 +576,8 @@
        ep_data_index_np = {k: t.numpy() for k, t in self.episode_data_index.items()}
        check_timestamps_sync(timestamps, episode_indices, ep_data_index_np, self.fps, self.tolerance_s)

+        # TODO(CarolinePascal): add a check for audio duration with respect to episode duration,
+        # BUT this will be CPU expensive if there are many episodes!

        # Setup delta_indices
        if self.delta_timestamps is not None:
            check_delta_timestamps(self.delta_timestamps, self.fps, self.tolerance_s)
@@ -522,6 +590,7 @@
        license: str | None = "apache-2.0",
        tag_version: bool = True,
        push_videos: bool = True,
+        push_audio: bool = True,
        private: bool = False,
        allow_patterns: list[str] | str | None = None,
        upload_large_folder: bool = False,
@@ -530,6 +599,8 @@
        ignore_patterns = ["images/"]
        if not push_videos:
            ignore_patterns.append("videos/")
+        if not push_audio:
+            ignore_patterns.append("audio/")

        hub_api = HfApi()
        hub_api.create_repo(
@@ -585,7 +656,7 @@
            ignore_patterns=ignore_patterns,
        )
-    def download_episodes(self, download_videos: bool = True) -> None:
+    def download_episodes(self, download_videos: bool = True, download_audio: bool = True) -> None:
        """Downloads the dataset from the given 'repo_id' at the provided version. If 'episodes' is given, this
        will only download those episodes (selected by their episode_index). If 'episodes' is None, the whole
        dataset will be downloaded. Thanks to the behavior of snapshot_download, if the files are already present
@@ -594,7 +665,11 @@
        # TODO(rcadene, aliberts): implement faster transfer
        # https://huggingface.co/docs/huggingface_hub/en/guides/download#faster-downloads
        files = None
-        ignore_patterns = None if download_videos else "videos/"
+        ignore_patterns = []
+        if not download_videos:
+            ignore_patterns.append("videos/")
+        if not download_audio:
+            ignore_patterns.append("audio/")

        if self.episodes is not None:
            files = self.get_episodes_file_paths()
@@ -611,6 +686,14 @@
            ]
            fpaths += video_files

+        if len(self.meta.audio_keys) > 0:
+            audio_files = [
+                str(self.meta.get_compressed_audio_file_path(ep_idx, audio_key))
+                for audio_key in self.meta.audio_keys
+                for ep_idx in episodes
+            ]
+            fpaths += audio_files
+
        return fpaths
    def load_hf_dataset(self) -> datasets.Dataset:
@@ -677,7 +760,7 @@
        }
        return query_indices, padding

-    def _get_query_timestamps(
+    def _get_query_timestamps_video(
        self,
        current_ts: float,
        query_indices: dict[str, list[int]] | None = None,
@@ -692,11 +775,27 @@

        return query_timestamps

+    # TODO(CarolinePascal): add variable query durations
+    def _get_query_timestamps_audio(
+        self,
+        current_ts: float,
+        query_indices: dict[str, list[int]] | None = None,
+    ) -> dict[str, list[float]]:
+        query_timestamps = {}
+        for key in self.meta.audio_keys:
+            if query_indices is not None and key in query_indices:
+                timestamps = self.hf_dataset.select(query_indices[key])["timestamp"]
+                query_timestamps[key] = torch.stack(timestamps).tolist()
+            else:
+                query_timestamps[key] = [current_ts]
+
+        return query_timestamps
+
    def _query_hf_dataset(self, query_indices: dict[str, list[int]]) -> dict:
        return {
            key: torch.stack(self.hf_dataset.select(q_idx)[key])
            for key, q_idx in query_indices.items()
-            if key not in self.meta.video_keys
+            if key not in self.meta.video_keys and key not in self.meta.audio_keys
        }
    def _query_videos(self, query_timestamps: dict[str, list[float]], ep_idx: int) -> dict[str, torch.Tensor]:
@@ -713,6 +812,17 @@
        return item

+    # TODO(CarolinePascal): add variable query durations
+    def _query_audio(
+        self, query_timestamps: dict[str, list[float]], query_duration: float, ep_idx: int
+    ) -> dict[str, torch.Tensor]:
+        item = {}
+        for audio_key, query_ts in query_timestamps.items():
+            audio_path = self.root / self.meta.get_compressed_audio_file_path(ep_idx, audio_key)
+            audio_chunk = decode_audio(audio_path, query_ts, query_duration, self.audio_backend)
+            item[audio_key] = audio_chunk.squeeze(0)
+        return item
+
    def _add_padding_keys(self, item: dict, padding: dict[str, list[bool]]) -> dict:
        for key, val in padding.items():
            item[key] = torch.BoolTensor(val)
@@ -733,11 +843,16 @@
            for key, val in query_result.items():
                item[key] = val

-        if len(self.meta.video_keys) > 0:
+        if len(self.meta.video_keys) > 0 or len(self.meta.audio_keys) > 0:
            current_ts = item["timestamp"].item()
-            query_timestamps = self._get_query_timestamps(current_ts, query_indices)
+
+            query_timestamps = self._get_query_timestamps_video(current_ts, query_indices)
            video_frames = self._query_videos(query_timestamps, ep_idx)
-            item = {**video_frames, **item}
+            item = {**item, **video_frames}
+
+            query_timestamps = self._get_query_timestamps_audio(current_ts, query_indices)
+            audio_chunks = self._query_audio(query_timestamps, DEFAULT_AUDIO_CHUNK_DURATION, ep_idx)
+            item = {**item, **audio_chunks}

        if self.image_transforms is not None:
            image_keys = self.meta.camera_keys
@@ -777,6 +892,10 @@
        )
        return self.root / fpath

+    def _get_raw_audio_file_path(self, episode_index: int, audio_key: str) -> Path:
+        fpath = DEFAULT_RAW_AUDIO_PATH.format(audio_key=audio_key, episode_index=episode_index)
+        return self.root / fpath
+
    def _save_image(self, image: torch.Tensor | np.ndarray | PIL.Image.Image, fpath: Path) -> None:
        if self.image_writer is None:
            if isinstance(image, torch.Tensor):
@@ -827,11 +946,39 @@
                img_path.parent.mkdir(parents=True, exist_ok=True)
                self._save_image(frame[key], img_path)
                self.episode_buffer[key].append(str(img_path))
+            elif self.features[key]["dtype"] == "audio":
+                if self.meta.robot_type is not None and self.meta.robot_type.startswith("lekiwi"):
+                    # Raw data storage should only be triggered for the LeKiwi robot, for which
+                    # audio is stored chunk by chunk in a visual frame-like manner
+                    self.episode_buffer[key].append(frame[key])
+                else:
+                    # Otherwise, only the audio file path is stored in the episode buffer
+                    if frame_index == 0:
+                        audio_path = self._get_raw_audio_file_path(
+                            episode_index=self.episode_buffer["episode_index"], audio_key=key
+                        )
+                        self.episode_buffer[key].append(str(audio_path))
            else:
                self.episode_buffer[key].append(frame[key])

        self.episode_buffer["size"] += 1

+    def add_microphone_recording(self, microphone: Microphone, microphone_key: str) -> None:
+        """
+        Starts recording audio data provided by the microphone and directly writes it to a .wav file.
+        """
+        audio_dir = self._get_raw_audio_file_path(
+            self.num_episodes, "observation.audio." + microphone_key
+        ).parent
+        if not audio_dir.is_dir():
+            audio_dir.mkdir(parents=True, exist_ok=True)
+
+        microphone.start_recording(
+            output_file=self._get_raw_audio_file_path(
+                self.num_episodes, "observation.audio." + microphone_key
+            )
+        )
+
    def save_episode(self, episode_data: dict | None = None) -> None:
        """
        This will save to disk the current episode in self.episode_buffer.
@@ -869,16 +1016,39 @@
            # are processed separately by storing image path and frame info as meta data
            if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video"]:
                continue
+            elif ft["dtype"] == "audio":
+                if self.meta.robot_type is not None and self.meta.robot_type.startswith("lekiwi"):
+                    # Raw data storage should only be triggered for the LeKiwi robot, for which
+                    # audio is stored chunk by chunk in a visual frame-like manner
+                    episode_buffer[key] = np.concatenate(episode_buffer[key], axis=0)
+                continue
            episode_buffer[key] = np.stack(episode_buffer[key])

        self._wait_image_writer()
        self._save_episode_table(episode_buffer, episode_index)

+        if self.meta.robot_type is not None and self.meta.robot_type.startswith("lekiwi"):
+            # Raw data storage should only be triggered for the LeKiwi robot, for which
+            # audio is stored chunk by chunk in a visual frame-like manner
+            for key in self.meta.audio_keys:
+                audio_path = self._get_raw_audio_file_path(
+                    episode_index=self.episode_buffer["episode_index"][0], audio_key=key
+                )
+                with sf.SoundFile(
+                    audio_path,
+                    mode="w",
+                    samplerate=self.meta.features[key]["info"]["sample_rate"],
+                    channels=self.meta.features[key]["shape"][0],
+                ) as file:
+                    file.write(episode_buffer[key])
+
        ep_stats = compute_episode_stats(episode_buffer, self.features)

        if len(self.meta.video_keys) > 0:
-            video_paths = self.encode_episode_videos(episode_index)
-            for key in self.meta.video_keys:
-                episode_buffer[key] = video_paths[key]
+            self.encode_episode_videos(episode_index)
+
+        if len(self.meta.audio_keys) > 0:
+            self.encode_episode_audio(episode_index)

        # `meta.save_episode` must be executed after encoding the videos
        self.meta.save_episode(episode_index, episode_length, episode_tasks, ep_stats)
@@ -904,6 +1074,13 @@
        if img_dir.is_dir():
            shutil.rmtree(self.root / "images")

+        # delete raw audio files
+        raw_audio_files = list(self.root.rglob("*.wav"))
+        for raw_audio_file in raw_audio_files:
+            raw_audio_file.unlink()
+            if len(list(raw_audio_file.parent.iterdir())) == 0:
+                raw_audio_file.parent.rmdir()
+
        if not episode_data:  # Reset the buffer
            self.episode_buffer = self.create_episode_buffer()
@@ -971,19 +1148,40 @@
        since video encoding with ffmpeg is already using multithreading.
        """
        video_paths = {}
-        for key in self.meta.video_keys:
-            video_path = self.root / self.meta.get_video_file_path(episode_index, key)
-            video_paths[key] = str(video_path)
+        for video_key in self.meta.video_keys:
+            video_path = self.root / self.meta.get_video_file_path(episode_index, video_key)
+            video_paths[video_key] = str(video_path)
            if video_path.is_file():
                # Skip if video is already encoded. Could be the case when resuming data recording.
                continue
            img_dir = self._get_image_file_path(
-                episode_index=episode_index, image_key=key, frame_index=0
+                episode_index=episode_index, image_key=video_key, frame_index=0
            ).parent
            encode_video_frames(img_dir, video_path, self.fps, overwrite=True)

        return video_paths

+    def encode_episode_audio(self, episode_index: int) -> dict:
+        """
+        Use ffmpeg to convert .wav raw audio files into .m4a audio files.
+        Note: `encode_episode_audio` is a blocking call. Making it asynchronous shouldn't speed up
+        encoding, since audio encoding with ffmpeg is already using multithreading.
+        """
+        audio_paths = {}
+        for audio_key in self.meta.audio_keys:
+            input_audio_path = self.root / self._get_raw_audio_file_path(episode_index, audio_key)
+            output_audio_path = self.root / self.meta.get_compressed_audio_file_path(episode_index, audio_key)
+
+            audio_paths[audio_key] = str(output_audio_path)
+            if output_audio_path.is_file():
+                # Skip if audio is already encoded. Could be the case when resuming data recording.
+                continue
+
+            encode_audio(input_audio_path, output_audio_path, overwrite=True)
+
+        return audio_paths
+
    @classmethod
    def create(
        cls,
@@ -998,6 +1196,7 @@
        image_writer_processes: int = 0,
        image_writer_threads: int = 0,
        video_backend: str | None = None,
+        audio_backend: str | None = None,
    ) -> "LeRobotDataset":
        """Create a LeRobot Dataset from scratch in order to record data."""
        obj = cls.__new__(cls)
@@ -1029,6 +1228,9 @@
        obj.delta_indices = None
        obj.episode_data_index = None
        obj.video_backend = video_backend if video_backend is not None else get_safe_default_codec()
+        obj.audio_backend = (
+            audio_backend if audio_backend is not None else "ffmpeg"
+        )  # Waiting for torchcodec release  # TODO(CarolinePascal)
        return obj
@@ -1049,6 +1251,7 @@
        tolerances_s: dict | None = None,
        download_videos: bool = True,
        video_backend: str | None = None,
+        audio_backend: str | None = None,
    ):
        super().__init__()
        self.repo_ids = repo_ids
@@ -1066,6 +1269,7 @@
                tolerance_s=self.tolerances_s[repo_id],
                download_videos=download_videos,
                video_backend=video_backend,
+                audio_backend=audio_backend,
            )
            for repo_id in repo_ids
        ]
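End to end, the new constructor arguments surface like this. A hedged sketch: the repo_id is made up, and the audio key assumes a dataset recorded with a laptop microphone.

```python
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

dataset = LeRobotDataset(
    repo_id="user/dataset-with-audio",  # hypothetical repository
    download_audio=True,  # fetch audio/ alongside videos/ (default)
    audio_backend="ffmpeg",  # torchaudio's ffmpeg decoder, the current default
)

item = dataset[0]
# Each audio key yields a chunk of DEFAULT_AUDIO_CHUNK_DURATION (0.5 s) starting at the
# frame's timestamp, shaped (samples, channels).
print(item["observation.audio.laptop"].shape)
```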

View File

@@ -33,6 +33,7 @@ from datasets.table import embed_table_storage
from huggingface_hub import DatasetCard, DatasetCardData, HfApi
from huggingface_hub.errors import RevisionNotFoundError
from PIL import Image as PILImage
+from soundfile import read
from torchvision import transforms

from lerobot.common.datasets.backward_compatibility import (
@@ -55,6 +56,10 @@ TASKS_PATH = "meta/tasks.jsonl"
DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
DEFAULT_PARQUET_PATH = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"
DEFAULT_IMAGE_PATH = "images/{image_key}/episode_{episode_index:06d}/frame_{frame_index:06d}.png"
+DEFAULT_RAW_AUDIO_PATH = "audio/{audio_key}/episode_{episode_index:06d}.wav"
+DEFAULT_COMPRESSED_AUDIO_PATH = "audio/chunk-{episode_chunk:03d}/{audio_key}/episode_{episode_index:06d}.m4a"
+DEFAULT_AUDIO_CHUNK_DURATION = 0.5  # seconds

DATASET_CARD_TEMPLATE = """
---
@@ -255,6 +260,11 @@ def load_image_as_numpy(
    return img_array

+def load_audio_from_path(fpath: str | Path) -> np.ndarray:
+    audio_data, _ = read(fpath, dtype="float32")
+    return audio_data
+
def hf_transform_to_torch(items_dict: dict[torch.Tensor | None]):
    """Get a transform function that convert items from Hugging Face dataset (pyarrow)
    to torch tensors. Importantly, images are converted from PIL, which corresponds to
@@ -363,7 +373,7 @@ def get_safe_version(repo_id: str, version: str | packaging.version.Version) ->
def get_hf_features_from_features(features: dict) -> datasets.Features:
    hf_features = {}
    for key, ft in features.items():
-        if ft["dtype"] == "video":
+        if ft["dtype"] == "video" or ft["dtype"] == "audio":
            continue
        elif ft["dtype"] == "image":
            hf_features[key] = datasets.Image()
@@ -394,7 +404,7 @@ def get_features_from_robot(robot: Robot, use_videos: bool = True) -> dict:
        key: {"dtype": "video" if use_videos else "image", **ft}
        for key, ft in robot.camera_features.items()
    }
-    return {**robot.motor_features, **camera_ft, **DEFAULT_FEATURES}
+    return {**robot.motor_features, **camera_ft, **robot.microphone_features, **DEFAULT_FEATURES}
def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFeature]:
@@ -442,12 +452,14 @@ def create_empty_dataset_info(
        "total_frames": 0,
        "total_tasks": 0,
        "total_videos": 0,
+        "total_audio": 0,
        "total_chunks": 0,
        "chunks_size": DEFAULT_CHUNK_SIZE,
        "fps": fps,
        "splits": {},
        "data_path": DEFAULT_PARQUET_PATH,
        "video_path": DEFAULT_VIDEO_PATH if use_videos else None,
+        "audio_path": DEFAULT_COMPRESSED_AUDIO_PATH,
        "features": features,
    }
@@ -721,6 +733,7 @@ def validate_features_presence(
):
    error_message = ""
    missing_features = expected_features - actual_features
+    missing_features = {feature for feature in missing_features if "observation.audio" not in feature}
    extra_features = actual_features - (expected_features | optional_features)

    if missing_features or extra_features:
@@ -740,6 +753,8 @@ def validate_feature_dtype_and_shape(name: str, feature: dict, value: np.ndarray
        return validate_feature_numpy_array(name, expected_dtype, expected_shape, value)
    elif expected_dtype in ["image", "video"]:
        return validate_feature_image_or_video(name, expected_shape, value)
+    elif expected_dtype == "audio":
+        return validate_feature_audio(name, expected_shape, value)
    elif expected_dtype == "string":
        return validate_feature_string(name, value)
    else:
@@ -781,6 +796,23 @@ def validate_feature_image_or_video(name: str, expected_shape: list[str], value:
    return error_message
+def validate_feature_audio(name: str, expected_shape: list[str], value: np.ndarray):
+    error_message = ""
+    if isinstance(value, np.ndarray):
+        actual_shape = value.shape
+        c = expected_shape
+        if len(actual_shape) != 2 or (actual_shape[-1] != c[-1] and actual_shape[0] != c[0]):
+            # The number of frames might be different
+            error_message += (
+                f"The feature '{name}' of shape '{actual_shape}' does not have the expected shape '{(c,)}'.\n"
+            )
+    else:
+        error_message += f"The feature '{name}' is expected to be of type 'np.ndarray', but type '{type(value)}' provided instead.\n"
+
+    return error_message
+
def validate_feature_string(name: str, value: str):
    if not isinstance(value, str):
        return f"The feature '{name}' is expected to be of type 'str', but type '{type(value)}' provided instead.\n"

View File

@@ -260,35 +260,39 @@ def encode_video_frames(
    imgs_dir = Path(imgs_dir)
    video_path.parent.mkdir(parents=True, exist_ok=True)

-    ffmpeg_args = OrderedDict(
+    ffmpeg_video_args = OrderedDict(
        [
            ("-f", "image2"),
            ("-r", str(fps)),
-            ("-i", str(imgs_dir / "frame_%06d.png")),
-            ("-vcodec", vcodec),
-            ("-pix_fmt", pix_fmt),
+            ("-i", str(Path(imgs_dir) / "frame_%06d.png")),
        ]
    )

+    ffmpeg_encoding_args = OrderedDict(
+        [
+            ("-pix_fmt", pix_fmt),
+            ("-vcodec", vcodec),
+        ]
+    )
+
    if g is not None:
-        ffmpeg_args["-g"] = str(g)
+        ffmpeg_encoding_args["-g"] = str(g)

    if crf is not None:
-        ffmpeg_args["-crf"] = str(crf)
+        ffmpeg_encoding_args["-crf"] = str(crf)

    if fast_decode:
        key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune"
        value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
-        ffmpeg_args[key] = value
+        ffmpeg_encoding_args[key] = value

    if log_level is not None:
-        ffmpeg_args["-loglevel"] = str(log_level)
+        ffmpeg_encoding_args["-loglevel"] = str(log_level)

-    ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair]
+    ffmpeg_args = [item for pair in ffmpeg_video_args.items() for item in pair]
+    ffmpeg_args += [item for pair in ffmpeg_encoding_args.items() for item in pair]
    if overwrite:
        ffmpeg_args.append("-y")

    ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)]
    # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
    subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
@@ -331,42 +335,6 @@ with warnings.catch_warnings():
    register_feature(VideoFrame, "VideoFrame")

-def get_audio_info(video_path: Path | str) -> dict:
-    ffprobe_audio_cmd = [
-        "ffprobe",
-        "-v",
-        "error",
-        "-select_streams",
-        "a:0",
-        "-show_entries",
-        "stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration",
-        "-of",
-        "json",
-        str(video_path),
-    ]
-    result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    if result.returncode != 0:
-        raise RuntimeError(f"Error running ffprobe: {result.stderr}")
-
-    info = json.loads(result.stdout)
-    audio_stream_info = info["streams"][0] if info.get("streams") else None
-    if audio_stream_info is None:
-        return {"has_audio": False}
-
-    # Return the information, defaulting to None if no audio stream is present
-    return {
-        "has_audio": True,
-        "audio.channels": audio_stream_info.get("channels", None),
-        "audio.codec": audio_stream_info.get("codec_name", None),
-        "audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None,
-        "audio.sample_rate": int(audio_stream_info["sample_rate"])
-        if audio_stream_info.get("sample_rate")
-        else None,
-        "audio.bit_depth": audio_stream_info.get("bit_depth", None),
-        "audio.channel_layout": audio_stream_info.get("channel_layout", None),
-    }
-
def get_video_info(video_path: Path | str) -> dict:
    ffprobe_video_cmd = [
        "ffprobe",
@@ -402,7 +370,6 @@ def get_video_info(video_path: Path | str) -> dict:
        "video.codec": video_stream_info["codec_name"],
        "video.pix_fmt": video_stream_info["pix_fmt"],
        "video.is_depth_map": False,
-        **get_audio_info(video_path),
    }

    return video_info
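The refactor splits the ffmpeg options into input-related and encoding-related dictionaries so the audio encoder can reuse the same flattening idiom; the idiom itself is just this (illustrative values):

```python
from collections import OrderedDict

video_args = OrderedDict([("-f", "image2"), ("-r", "30")])
encoding_args = OrderedDict([("-pix_fmt", "yuv420p"), ("-vcodec", "libsvtav1")])

# Flatten each (flag, value) pair into a flat argv list, input options first.
argv = [item for pair in video_args.items() for item in pair]
argv += [item for pair in encoding_args.items() for item in pair]
print(argv)  # ['-f', 'image2', '-r', '30', '-pix_fmt', 'yuv420p', '-vcodec', 'libsvtav1']
```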

View File

@@ -78,6 +78,11 @@ def log_control_info(robot: Robot, dt_s, episode_index=None, frame_index=None, f
        if key in robot.logs:
            log_dt(f"dtR{name}", robot.logs[key])

+    for name in robot.microphones:
+        key = f"read_microphone_{name}_dt_s"
+        if key in robot.logs:
+            log_dt(f"dtR{name}", robot.logs[key])
+
    info_str = " ".join(log_items)
    logging.info(info_str)
@@ -107,11 +112,15 @@ def predict_action(observation, policy, device, use_amp):
        torch.inference_mode(),
        torch.autocast(device_type=device.type) if device.type == "cuda" and use_amp else nullcontext(),
    ):
-        # Convert to pytorch format: channel first and float32 in [0,1] with batch dimension
        for name in observation:
+            # Convert to pytorch format: channel first and float32 in [0,1] with batch dimension
            if "image" in name:
                observation[name] = observation[name].type(torch.float32) / 255
                observation[name] = observation[name].permute(2, 0, 1).contiguous()
+            # Convert to pytorch format: channel first and float32 in [-1,1] with batch dimension
+            if "audio" in name:
+                observation[name] = observation[name].type(torch.float32)
+                observation[name] = observation[name].permute(1, 0).contiguous()
            observation[name] = observation[name].unsqueeze(0)
            observation[name] = observation[name].to(device)
@@ -243,6 +252,18 @@ def control_loop(
    timestamp = 0
    start_episode_t = time.perf_counter()

+    if dataset is not None and not robot.robot_type.startswith("lekiwi"):
+        # For now, LeKiwi only supports frame audio recording (which may lead to audio chunk loss,
+        # extended post-processing, and increased memory usage)
+        for microphone_key, microphone in robot.microphones.items():
+            # Start recording both in file writing and data reading mode
+            dataset.add_microphone_recording(microphone, microphone_key)
+    else:
+        for _, microphone in robot.microphones.items():
+            # Start recording only in data reading mode
+            microphone.start_recording()
+
    while timestamp < control_time_s:
        start_loop_t = time.perf_counter()
@@ -286,6 +307,9 @@ def control_loop(
            events["exit_early"] = False
            break

+    for _, microphone in robot.microphones.items():
+        microphone.stop_recording()
+
def reset_environment(robot, events, reset_time_s, fps):
    # TODO(rcadene): refactor warmup_record and reset_environment
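For reference, the audio branch in predict_action expects a (samples, channels) chunk and makes it channel-first, mirroring the image path. A sketch with made-up sizes:

```python
import torch

# Hypothetical chunk: 8000 samples of 2-channel audio in [-1, 1], as Microphone.read() returns.
audio = torch.rand(8000, 2) * 2 - 1

audio = audio.type(torch.float32)
audio = audio.permute(1, 0).contiguous()  # channel first: (2, 8000)
audio = audio.unsqueeze(0)  # batch dimension: (1, 2, 8000)
print(audio.shape)
```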

View File

@@ -0,0 +1,38 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
from dataclasses import dataclass
import draccus
@dataclass
class MicrophoneConfigBase(draccus.ChoiceRegistry, abc.ABC):
@property
def type(self) -> str:
return self.get_choice_name(self.__class__)
@MicrophoneConfigBase.register_subclass("microphone")
@dataclass
class MicrophoneConfig(MicrophoneConfigBase):
"""
Dataclass for microphone configuration.
"""
microphone_index: int
sample_rate: int | None = None
channels: list[int] | None = None
mock: bool = False
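Since MicrophoneConfigBase is a draccus ChoiceRegistry, configs can be instantiated directly or resolved by their registered name; a minimal sketch (index and sample rate are placeholder values):

```python
from lerobot.common.robot_devices.microphones.configs import MicrophoneConfig

# Placeholder values: device 0, 16 kHz, first channel only.
config = MicrophoneConfig(microphone_index=0, sample_rate=16000, channels=[1])
print(config.type)  # "microphone", the name registered via register_subclass
```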

View File

@@ -0,0 +1,425 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file contains utilities for recording audio from a microphone.
"""
import argparse
import logging
import shutil
import time
from multiprocessing import Event as process_Event
from multiprocessing import JoinableQueue as process_Queue
from multiprocessing import Process
from os import getcwd
from pathlib import Path
from queue import Empty
from queue import Queue as thread_Queue
from threading import Event, Thread
from threading import Event as thread_Event
import numpy as np
import soundfile as sf
from lerobot.common.robot_devices.microphones.configs import MicrophoneConfig
from lerobot.common.robot_devices.utils import (
RobotDeviceAlreadyConnectedError,
RobotDeviceAlreadyRecordingError,
RobotDeviceNotConnectedError,
RobotDeviceNotRecordingError,
)
from lerobot.common.utils.utils import capture_timestamp_utc
def find_microphones(raise_when_empty=False, mock=False) -> list[dict]:
"""
Finds and lists all microphones compatible with sounddevice (and the underlying PortAudio library).
Most microphones and sound cards are compatible, across all OS (Linux, Mac, Windows).
"""
microphones = []
if mock:
import tests.microphones.mock_sounddevice as sd
else:
import sounddevice as sd
devices = sd.query_devices()
for device in devices:
if device["max_input_channels"] > 0:
microphones.append(
{
"index": device["index"],
"name": device["name"],
}
)
if raise_when_empty and len(microphones) == 0:
raise OSError(
"Not a single microphone was detected. Try re-plugging the microphone or check the microphone settings."
)
return microphones
def record_audio_from_microphones(
output_dir: Path, microphone_ids: list[int] | None = None, record_time_s: float = 2.0
):
"""
Records audio from all the channels of the specified microphones for the specified duration.
If no microphone ids are provided, all available microphones will be used.
"""
if microphone_ids is None or len(microphone_ids) == 0:
microphones = find_microphones()
microphone_ids = [m["index"] for m in microphones]
microphones = []
for microphone_id in microphone_ids:
config = MicrophoneConfig(microphone_index=microphone_id)
microphone = Microphone(config)
microphone.connect()
print(
f"Recording audio from microphone {microphone_id} for {record_time_s} seconds at {microphone.sample_rate} Hz."
)
microphones.append(microphone)
output_dir = Path(output_dir)
if output_dir.exists():
shutil.rmtree(
output_dir,
)
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Saving audio to {output_dir}")
for microphone in microphones:
microphone.start_recording(getcwd() / output_dir / f"microphone_{microphone.microphone_index}.wav")
time.sleep(record_time_s)
for microphone in microphones:
microphone.stop_recording()
# Remark : recording may be resumed here if needed
for microphone in microphones:
microphone.disconnect()
print(f"Images have been saved to {output_dir}")
class Microphone:
"""
The Microphone class handles all microphones compatible with sounddevice (and the underlying PortAudio library). Most microphones and sound cards are compatible, across all OS (Linux, Mac, Windows).
A Microphone instance requires the sounddevice index of the microphone, which may be obtained using `python -m sounddevice`. It also requires the recording sample rate as well as the list of recorded channels.
Example of usage:
```python
from lerobot.common.robot_devices.microphones.configs import MicrophoneConfig
config = MicrophoneConfig(microphone_index=0, sample_rate=16000, channels=[1])
microphone = Microphone(config)
microphone.connect()
microphone.start_recording("some/output/file.wav")
...
    audio_readings = microphone.read()  # Gets all recorded audio data since the last read or since the beginning of the recording
...
microphone.stop_recording()
microphone.disconnect()
```
"""
def __init__(self, config: MicrophoneConfig):
self.config = config
self.microphone_index = config.microphone_index
# Store the recording sample rate and channels
self.sample_rate = config.sample_rate
self.channels = config.channels
self.mock = config.mock
# Input audio stream
self.stream = None
# Thread/Process-safe concurrent queue to store the recorded/read audio
self.record_queue = None
self.read_queue = None
# Thread/Process to handle data reading and file writing in a separate thread/process (safely)
self.record_thread = None
self.record_stop_event = None
self.logs = {}
self.is_connected = False
self.is_recording = False
self.is_writing = False
def connect(self) -> None:
"""
Connects the microphone and checks if the requested acquisition parameters are compatible with the microphone.
"""
if self.is_connected:
raise RobotDeviceAlreadyConnectedError(
f"Microphone {self.microphone_index} is already connected."
)
if self.mock:
import tests.microphones.mock_sounddevice as sd
else:
import sounddevice as sd
# Check if the provided microphone index does match an input device
is_index_input = sd.query_devices(self.microphone_index)["max_input_channels"] > 0
if not is_index_input:
microphones_info = find_microphones()
available_microphones = [m["index"] for m in microphones_info]
raise OSError(
f"Microphone index {self.microphone_index} does not match an input device (microphone). Available input devices : {available_microphones}"
)
# Check if provided recording parameters are compatible with the microphone
actual_microphone = sd.query_devices(self.microphone_index)
if self.sample_rate is not None:
if self.sample_rate > actual_microphone["default_samplerate"]:
raise OSError(
f"Provided sample rate {self.sample_rate} is higher than the sample rate of the microphone {actual_microphone['default_samplerate']}."
)
elif self.sample_rate < actual_microphone["default_samplerate"]:
logging.warning(
"Provided sample rate is lower than the sample rate of the microphone. Performance may be impacted."
)
else:
self.sample_rate = int(actual_microphone["default_samplerate"])
if self.channels is not None and len(self.channels) > 0:
if any(c > actual_microphone["max_input_channels"] for c in self.channels):
raise OSError(
f"Some of the provided channels {self.channels} are outside the maximum channel range of the microphone {actual_microphone['max_input_channels']}."
)
else:
self.channels = np.arange(1, actual_microphone["max_input_channels"] + 1)
# Get channels index instead of number for slicing
self.channels_index = np.array(self.channels) - 1
# Create the audio stream
self.stream = sd.InputStream(
device=self.microphone_index,
samplerate=self.sample_rate,
channels=max(self.channels),
dtype="float32",
callback=self._audio_callback,
)
# Remark : the blocksize parameter could be passed to the stream to ensure that audio_callback always receive same length buffers.
# However, this may lead to additional latency. We thus stick to blocksize=0 which means that audio_callback will receive varying length buffers, but with no additional latency.
self.is_connected = True
def _audio_callback(self, indata, frames, time, status) -> None:
"""
Low-level sounddevice callback.
"""
if status:
logging.warning(status)
# Slicing makes copy unnecessary
# Two separate queues are necessary because .get() also pops the data from the queue
if self.is_writing:
self.record_queue.put(indata[:, self.channels_index])
self.read_queue.put(indata[:, self.channels_index])
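# Note: sounddevice calls this callback with indata of shape (frames, channels);
# only the selected channels (via channels_index) are forwarded to the queues.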
@staticmethod
def _record_loop(queue, event: Event, sample_rate: int, channels: list[int], output_file: Path) -> None:
"""
Thread/Process-safe loop to write audio data into a file.
"""
# Must only run in a single process/thread to keep file writing safe
with sf.SoundFile(
output_file,
mode="x",
samplerate=sample_rate,
channels=len(channels), # data in the queue is already sliced down to the selected channels
subtype=sf.default_subtype(output_file.suffix[1:]),
) as file:
while not event.is_set():
try:
file.write(
queue.get(timeout=0.02)
) # Timeout set as twice the usual sounddevice buffer size
queue.task_done()
except Empty:
continue
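# Note: _audio_callback and this loop form a producer/consumer pair over the queue;
# queue.task_done() lets stop_recording() block on record_queue.join() until every
# queued chunk has been written to disk.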
def _read(self) -> np.ndarray:
"""
Thread/Process-safe callback to read available audio data
"""
audio_readings = np.empty((0, len(self.channels)), dtype=np.float32) # Match the float32 dtype of the stream to avoid upcasting
while True:
try:
audio_readings = np.concatenate((audio_readings, self.read_queue.get_nowait()), axis=0)
except Empty:
break
self.read_queue = thread_Queue()
return audio_readings
def read(self) -> np.ndarray:
"""
Reads the last audio chunk recorded by the microphone, e.g. all samples recorded since the last read or since the beginning of the recording.
"""
if not self.is_connected:
raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
if not self.is_recording:
raise RobotDeviceNotRecordingError(f"Microphone {self.microphone_index} is not recording.")
start_time = time.perf_counter()
audio_readings = self._read()
# log the number of seconds it took to read the audio chunk
self.logs["delta_timestamp_s"] = time.perf_counter() - start_time
# log the utc time at which the audio chunk was received
self.logs["timestamp_utc"] = capture_timestamp_utc()
return audio_readings
def start_recording(self, output_file: str | None = None, multiprocessing: bool | None = False) -> None:
"""
Starts the recording of the microphone. If output_file is provided, the audio will be written to this file.
"""
if not self.is_connected:
raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
if self.is_recording:
raise RobotDeviceAlreadyRecordingError(
f"Microphone {self.microphone_index} is already recording."
)
# Reset queues
self.read_queue = thread_Queue()
if multiprocessing:
self.record_queue = process_Queue()
else:
self.record_queue = thread_Queue()
# Write recordings into a file if output_file is provided
if output_file is not None:
output_file = Path(output_file)
if output_file.exists():
output_file.unlink()
if multiprocessing:
self.record_stop_event = process_Event()
self.record_thread = Process(
target=Microphone._record_loop,
args=(
self.record_queue,
self.record_stop_event,
self.sample_rate,
self.channels,
output_file,
),
)
else:
self.record_stop_event = thread_Event()
self.record_thread = Thread(
target=Microphone._record_loop,
args=(
self.record_queue,
self.record_stop_event,
self.sample_rate,
self.channels,
output_file,
),
)
self.record_thread.daemon = True
self.record_thread.start()
self.is_writing = True
self.is_recording = True
self.stream.start()
def stop_recording(self) -> None:
"""
Stops the recording of the microphones.
"""
if not self.is_connected:
raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
if not self.is_recording:
raise RobotDeviceNotRecordingError(f"Microphone {self.microphone_index} is not recording.")
if self.stream.active:
self.stream.stop() # Wait for all buffers to be processed
# Remark: stream.abort() would flush the buffers instead!
self.is_recording = False
if self.record_thread is not None:
self.record_queue.join()
self.record_stop_event.set()
self.record_thread.join()
self.record_thread = None
self.record_stop_event = None
self.is_writing = False
def disconnect(self) -> None:
"""
Disconnects the microphone and stops the recording.
"""
if not self.is_connected:
raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
if self.is_recording:
self.stop_recording()
self.stream.close()
self.is_connected = False
def __del__(self):
if getattr(self, "is_connected", False):
self.disconnect()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Records audio using `Microphone` for all microphones connected to the computer, or a selected subset."
)
parser.add_argument(
"--microphone-ids",
type=int,
nargs="*",
default=None,
help="List of microphones indices used to instantiate the `Microphone`. If not provided, find and use all available microphones indices.",
)
parser.add_argument(
"--output-dir",
type=Path,
default="outputs/audio_from_microphones",
help="Set directory to save an audio snippet for each microphone.",
)
parser.add_argument(
"--record-time-s",
type=float,
default=4.0,
help="Set the number of seconds used to record the audio. By default, 4 seconds.",
)
args = parser.parse_args()
record_audio_from_microphones(**vars(args))
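For reference, a minimal end-to-end sketch of the recording API defined in this file; the device index, sample rate, and output path below are placeholder values:

```python
import time

from lerobot.common.robot_devices.microphones.configs import MicrophoneConfig
from lerobot.common.robot_devices.microphones.microphone import Microphone

# Index 0 and the output path are placeholders
config = MicrophoneConfig(microphone_index=0, sample_rate=48000, channels=[1])
microphone = Microphone(config)
microphone.connect()
microphone.start_recording(output_file="recording.wav")
time.sleep(2.0)
audio_chunk = microphone.read()  # (n_samples, n_channels) float32 array recorded so far
microphone.stop_recording()
microphone.disconnect()
```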

View File

@ -0,0 +1,48 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Protocol
from lerobot.common.robot_devices.microphones.configs import MicrophoneConfig, MicrophoneConfigBase
# Defines a microphone type
class Microphone(Protocol):
def connect(self): ...
def disconnect(self): ...
def start_recording(self, output_file: str | None = None, multiprocessing: bool | None = False): ...
def stop_recording(self): ...
def make_microphones_from_configs(microphone_configs: dict[str, MicrophoneConfigBase]) -> dict[str, Microphone]:
microphones = {}
for key, cfg in microphone_configs.items():
if cfg.type == "microphone":
from lerobot.common.robot_devices.microphones.microphone import Microphone
microphones[key] = Microphone(cfg)
else:
raise ValueError(f"The microphone type '{cfg.type}' is not valid.")
return microphones
def make_microphone(microphone_type, **kwargs) -> Microphone:
if microphone_type == "microphone":
from lerobot.common.robot_devices.microphones.microphone import Microphone
return Microphone(MicrophoneConfig(**kwargs))
else:
raise ValueError(f"The microphone type '{microphone_type}' is not valid.")

View File

@ -23,6 +23,7 @@ from lerobot.common.robot_devices.cameras.configs import (
IntelRealSenseCameraConfig, IntelRealSenseCameraConfig,
OpenCVCameraConfig, OpenCVCameraConfig,
) )
from lerobot.common.robot_devices.microphones.configs import MicrophoneConfig
from lerobot.common.robot_devices.motors.configs import ( from lerobot.common.robot_devices.motors.configs import (
DynamixelMotorsBusConfig, DynamixelMotorsBusConfig,
FeetechMotorsBusConfig, FeetechMotorsBusConfig,
@ -43,6 +44,7 @@ class ManipulatorRobotConfig(RobotConfig):
leader_arms: dict[str, MotorsBusConfig] = field(default_factory=lambda: {}) leader_arms: dict[str, MotorsBusConfig] = field(default_factory=lambda: {})
follower_arms: dict[str, MotorsBusConfig] = field(default_factory=lambda: {}) follower_arms: dict[str, MotorsBusConfig] = field(default_factory=lambda: {})
cameras: dict[str, CameraConfig] = field(default_factory=lambda: {}) cameras: dict[str, CameraConfig] = field(default_factory=lambda: {})
microphones: dict[str, MicrophoneConfig] = field(default_factory=lambda: {})
# Optionally limit the magnitude of the relative positional target vector for safety purposes. # Optionally limit the magnitude of the relative positional target vector for safety purposes.
# Set this to a positive scalar to have the same value for all motors, or a list that is the same length # Set this to a positive scalar to have the same value for all motors, or a list that is the same length
@ -68,6 +70,9 @@ class ManipulatorRobotConfig(RobotConfig):
for cam in self.cameras.values(): for cam in self.cameras.values():
if not cam.mock: if not cam.mock:
cam.mock = True cam.mock = True
for mic in self.microphones.values():
if not mic.mock:
mic.mock = True
if self.max_relative_target is not None and isinstance(self.max_relative_target, Sequence): if self.max_relative_target is not None and isinstance(self.max_relative_target, Sequence):
for name in self.follower_arms: for name in self.follower_arms:
@ -491,6 +496,21 @@ class So100RobotConfig(ManipulatorRobotConfig):
} }
) )
microphones: dict[str, MicrophoneConfig] = field(
default_factory=lambda: {
"laptop": MicrophoneConfig(
microphone_index=0,
sample_rate=48000,
channels=[1],
),
"headset": MicrophoneConfig(
microphone_index=1,
sample_rate=44100,
channels=[1],
),
}
)
mock: bool = False mock: bool = False
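For context, a sketch of how these So100 defaults could be overridden at instantiation time; the values are placeholders, and mock=True would also propagate to the microphones via __post_init__, as shown earlier in this diff:

```python
from lerobot.common.robot_devices.microphones.configs import MicrophoneConfig
from lerobot.common.robot_devices.robots.configs import So100RobotConfig

config = So100RobotConfig(
    microphones={
        "laptop": MicrophoneConfig(microphone_index=2, sample_rate=16000, channels=[1]),
    },
    mock=False,
)
```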

View File

@ -52,6 +52,16 @@ def run_camera_capture(cameras, images_lock, latest_images_dict, stop_event):
time.sleep(0.01) time.sleep(0.01)
def run_microphone_capture(microphones, audio_lock, latest_audio_dict, stop_event):
while not stop_event.is_set():
local_dict = {}
for name, microphone in microphones.items():
audio_readings = microphone.read()
local_dict[name] = audio_readings
with audio_lock:
latest_audio_dict.update(local_dict)
def calibrate_follower_arm(motors_bus, calib_dir_str): def calibrate_follower_arm(motors_bus, calib_dir_str):
""" """
Calibrates the follower arm. Attempts to load an existing calibration file; Calibrates the follower arm. Attempts to load an existing calibration file;
@ -94,6 +104,7 @@ def run_lekiwi(robot_config):
""" """
# Import helper functions and classes # Import helper functions and classes
from lerobot.common.robot_devices.cameras.utils import make_cameras_from_configs from lerobot.common.robot_devices.cameras.utils import make_cameras_from_configs
from lerobot.common.robot_devices.microphones.utils import make_microphones_from_configs
from lerobot.common.robot_devices.motors.feetech import FeetechMotorsBus, TorqueMode from lerobot.common.robot_devices.motors.feetech import FeetechMotorsBus, TorqueMode
# Initialize cameras from the robot configuration. # Initialize cameras from the robot configuration.
@ -101,6 +112,11 @@ def run_lekiwi(robot_config):
for cam in cameras.values(): for cam in cameras.values():
cam.connect() cam.connect()
# Initialize microphones from the robot configuration.
microphones = make_microphones_from_configs(robot_config.microphones)
for microphone in microphones.values():
microphone.connect()
# Initialize the motors bus using the follower arm configuration. # Initialize the motors bus using the follower arm configuration.
motor_config = robot_config.follower_arms.get("main") motor_config = robot_config.follower_arms.get("main")
if motor_config is None: if motor_config is None:
@ -134,6 +150,20 @@ def run_lekiwi(robot_config):
) )
cam_thread.start() cam_thread.start()
# Start the microphone recording and capture thread.
# TODO(CarolinePascal): Leverage multi-core processing with a multiprocessing.Process instead!
latest_audio_dict = {}
audio_lock = threading.Lock()
audio_stop_event = threading.Event()
microphone_thread = threading.Thread(
target=run_microphone_capture,
args=(microphones, audio_lock, latest_audio_dict, audio_stop_event),
daemon=True,
)
for microphone in microphones.values():
microphone.start_recording()
microphone_thread.start()
last_cmd_time = time.time() last_cmd_time = time.time()
print("LeKiwi robot server started. Waiting for commands...") print("LeKiwi robot server started. Waiting for commands...")
@ -198,9 +228,14 @@ def run_lekiwi(robot_config):
with images_lock: with images_lock:
images_dict_copy = dict(latest_images_dict) images_dict_copy = dict(latest_images_dict)
# Get the latest audio data.
with audio_lock:
audio_dict_copy = dict(latest_audio_dict)
# Build the observation dictionary. # Build the observation dictionary.
observation = { observation = {
"images": images_dict_copy, "images": images_dict_copy,
"audio": audio_dict_copy, # TODO(CarolinePascal) : This is a nasty way to do it, sorry.
"present_speed": current_velocity, "present_speed": current_velocity,
"follower_arm_state": follower_arm_state, "follower_arm_state": follower_arm_state,
} }
@ -217,6 +252,9 @@ def run_lekiwi(robot_config):
finally: finally:
stop_event.set() stop_event.set()
cam_thread.join() cam_thread.join()
audio_stop_event.set() # Signal the audio capture thread to exit; without this, join() below would block forever
microphone_thread.join()
for microphone in microphones.values():
microphone.stop_recording()
robot.stop() robot.stop()
motors_bus.disconnect() motors_bus.disconnect()
cmd_socket.close() cmd_socket.close()

View File

@ -28,6 +28,7 @@ import numpy as np
import torch import torch
from lerobot.common.robot_devices.cameras.utils import make_cameras_from_configs from lerobot.common.robot_devices.cameras.utils import make_cameras_from_configs
from lerobot.common.robot_devices.microphones.utils import make_microphones_from_configs
from lerobot.common.robot_devices.motors.utils import MotorsBus, make_motors_buses_from_configs from lerobot.common.robot_devices.motors.utils import MotorsBus, make_motors_buses_from_configs
from lerobot.common.robot_devices.robots.configs import ManipulatorRobotConfig from lerobot.common.robot_devices.robots.configs import ManipulatorRobotConfig
from lerobot.common.robot_devices.robots.utils import get_arm_id from lerobot.common.robot_devices.robots.utils import get_arm_id
@ -164,6 +165,7 @@ class ManipulatorRobot:
self.leader_arms = make_motors_buses_from_configs(self.config.leader_arms) self.leader_arms = make_motors_buses_from_configs(self.config.leader_arms)
self.follower_arms = make_motors_buses_from_configs(self.config.follower_arms) self.follower_arms = make_motors_buses_from_configs(self.config.follower_arms)
self.cameras = make_cameras_from_configs(self.config.cameras) self.cameras = make_cameras_from_configs(self.config.cameras)
self.microphones = make_microphones_from_configs(self.config.microphones)
self.is_connected = False self.is_connected = False
self.logs = {} self.logs = {}
@ -199,9 +201,24 @@ class ManipulatorRobot:
}, },
} }
@property
def microphone_features(self) -> dict:
mic_ft = {}
for mic_key, mic in self.microphones.items():
key = f"observation.audio.{mic_key}"
mic_ft[key] = {
"dtype": "audio",
"shape": (len(mic.channels),),
"names": "channels",
"info": {
"sample_rate": mic.sample_rate
}, # the sample rate must be stored here for chunked audio recording (as in LeKiwi), since it is no longer available when the audio file is written
}
return mic_ft
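# Illustration: for the default So100 "laptop" microphone (channels=[1], sample_rate=48000),
# this yields {"observation.audio.laptop": {"dtype": "audio", "shape": (1,), "names": "channels", "info": {"sample_rate": 48000}}}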
@property @property
def features(self): def features(self):
return {**self.motor_features, **self.camera_features} return {**self.motor_features, **self.camera_features, **self.microphone_features}
@property @property
def has_camera(self): def has_camera(self):
@ -211,6 +228,14 @@ class ManipulatorRobot:
def num_cameras(self): def num_cameras(self):
return len(self.cameras) return len(self.cameras)
@property
def has_microphone(self):
return len(self.microphones) > 0
@property
def num_microphones(self):
return len(self.microphones)
@property @property
def available_arms(self): def available_arms(self):
available_arms = [] available_arms = []
@ -228,7 +253,7 @@ class ManipulatorRobot:
"ManipulatorRobot is already connected. Do not run `robot.connect()` twice." "ManipulatorRobot is already connected. Do not run `robot.connect()` twice."
) )
if not self.leader_arms and not self.follower_arms and not self.cameras: if not self.leader_arms and not self.follower_arms and not self.cameras and not self.microphones:
raise ValueError( raise ValueError(
"ManipulatorRobot doesn't have any device to connect. See example of usage in docstring of the class." "ManipulatorRobot doesn't have any device to connect. See example of usage in docstring of the class."
) )
@ -289,6 +314,10 @@ class ManipulatorRobot:
for name in self.cameras: for name in self.cameras:
self.cameras[name].connect() self.cameras[name].connect()
# Connect the microphones
for name in self.microphones:
self.microphones[name].connect()
self.is_connected = True self.is_connected = True
def activate_calibration(self): def activate_calibration(self):
@ -514,12 +543,23 @@ class ManipulatorRobot:
self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"] self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t
# Capture audio from microphones
audio = {}
for name in self.microphones:
before_audioread_t = time.perf_counter()
audio[name] = self.microphones[name].read()
audio[name] = torch.from_numpy(audio[name])
self.logs[f"read_microphone_{name}_dt_s"] = self.microphones[name].logs["delta_timestamp_s"]
self.logs[f"async_read_microphone_{name}_dt_s"] = time.perf_counter() - before_audioread_t
# Populate output dictionaries # Populate output dictionaries
obs_dict, action_dict = {}, {} obs_dict, action_dict = {}, {}
obs_dict["observation.state"] = state obs_dict["observation.state"] = state
action_dict["action"] = action action_dict["action"] = action
for name in self.cameras: for name in self.cameras:
obs_dict[f"observation.images.{name}"] = images[name] obs_dict[f"observation.images.{name}"] = images[name]
for name in self.microphones:
obs_dict[f"observation.audio.{name}"] = audio[name]
return obs_dict, action_dict return obs_dict, action_dict
@ -554,11 +594,22 @@ class ManipulatorRobot:
self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"] self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t
# Capture audio from microphones
audio = {}
for name in self.microphones:
before_audioread_t = time.perf_counter()
audio[name] = self.microphones[name].read()
audio[name] = torch.from_numpy(audio[name])
self.logs[f"read_microphone_{name}_dt_s"] = self.microphones[name].logs["delta_timestamp_s"]
self.logs[f"async_read_microphone_{name}_dt_s"] = time.perf_counter() - before_audioread_t
# Populate output dictionaries and format to pytorch # Populate output dictionaries and format to pytorch
obs_dict = {} obs_dict = {}
obs_dict["observation.state"] = state obs_dict["observation.state"] = state
for name in self.cameras: for name in self.cameras:
obs_dict[f"observation.images.{name}"] = images[name] obs_dict[f"observation.images.{name}"] = images[name]
for name in self.microphones:
obs_dict[f"observation.audio.{name}"] = audio[name]
return obs_dict return obs_dict
def send_action(self, action: torch.Tensor) -> torch.Tensor: def send_action(self, action: torch.Tensor) -> torch.Tensor:
@ -620,6 +671,9 @@ class ManipulatorRobot:
for name in self.cameras: for name in self.cameras:
self.cameras[name].disconnect() self.cameras[name].disconnect()
for name in self.microphones:
self.microphones[name].disconnect()
self.is_connected = False self.is_connected = False
def __del__(self): def __del__(self):

View File

@ -24,6 +24,7 @@ import torch
import zmq import zmq
from lerobot.common.robot_devices.cameras.utils import make_cameras_from_configs from lerobot.common.robot_devices.cameras.utils import make_cameras_from_configs
from lerobot.common.robot_devices.microphones.utils import make_microphones_from_configs
from lerobot.common.robot_devices.motors.feetech import TorqueMode from lerobot.common.robot_devices.motors.feetech import TorqueMode
from lerobot.common.robot_devices.motors.utils import MotorsBus, make_motors_buses_from_configs from lerobot.common.robot_devices.motors.utils import MotorsBus, make_motors_buses_from_configs
from lerobot.common.robot_devices.robots.configs import LeKiwiRobotConfig from lerobot.common.robot_devices.robots.configs import LeKiwiRobotConfig
@ -79,6 +80,7 @@ class MobileManipulator:
self.follower_arms = make_motors_buses_from_configs(self.config.follower_arms) self.follower_arms = make_motors_buses_from_configs(self.config.follower_arms)
self.cameras = make_cameras_from_configs(self.config.cameras) self.cameras = make_cameras_from_configs(self.config.cameras)
self.microphones = make_microphones_from_configs(self.config.microphones)
self.is_connected = False self.is_connected = False
@ -133,6 +135,7 @@ class MobileManipulator:
"shape": (cam.height, cam.width, cam.channels), "shape": (cam.height, cam.width, cam.channels),
"names": ["height", "width", "channels"], "names": ["height", "width", "channels"],
"info": None, "info": None,
"audio": "observation.audio." + cam.microphone if cam.microphone is not None else None,
} }
return cam_ft return cam_ft
@ -161,9 +164,22 @@ class MobileManipulator:
}, },
} }
@property
def microphone_features(self) -> dict:
mic_ft = {}
for mic_key, mic in self.microphones.items():
key = f"observation.audio.{mic_key}"
mic_ft[key] = {
"dtype": "audio",
"shape": (len(mic.channels),),
"names": "channels",
"info": {"sample_rate": mic.sample_rate},
}
return mic_ft
@property @property
def features(self): def features(self):
return {**self.motor_features, **self.camera_features} return {**self.motor_features, **self.camera_features, **self.microphone_features}
@property @property
def has_camera(self): def has_camera(self):
@ -173,6 +189,14 @@ class MobileManipulator:
def num_cameras(self): def num_cameras(self):
return len(self.cameras) return len(self.cameras)
@property
def has_microphone(self):
return len(self.microphones) > 0
@property
def num_microphones(self):
return len(self.microphones)
@property @property
def available_arms(self): def available_arms(self):
available = [] available = []
@ -344,6 +368,7 @@ class MobileManipulator:
observation = json.loads(last_msg) observation = json.loads(last_msg)
images_dict = observation.get("images", {}) images_dict = observation.get("images", {})
audio_dict = observation.get("audio", {})
new_speed = observation.get("present_speed", {}) new_speed = observation.get("present_speed", {})
new_arm_state = observation.get("follower_arm_state", None) new_arm_state = observation.get("follower_arm_state", None)
@ -356,6 +381,11 @@ class MobileManipulator:
if frame_candidate is not None: if frame_candidate is not None:
frames[cam_name] = frame_candidate frames[cam_name] = frame_candidate
# Receive audio
for microphone_name, audio_data in audio_dict.items():
if audio_data:
frames[microphone_name] = audio_data
# If remote_arm_state is None and frames is None there is no message then use the previous message # If remote_arm_state is None and frames is None there is no message then use the previous message
if new_arm_state is not None and frames is not None: if new_arm_state is not None and frames is not None:
self.last_frames = frames self.last_frames = frames
@ -475,6 +505,14 @@ class MobileManipulator:
frame = np.zeros((cam.height, cam.width, cam.channels), dtype=np.uint8) frame = np.zeros((cam.height, cam.width, cam.channels), dtype=np.uint8)
obs_dict[f"observation.images.{cam_name}"] = torch.from_numpy(frame) obs_dict[f"observation.images.{cam_name}"] = torch.from_numpy(frame)
# Loop over each configured microphone
for microphone_name, microphone in self.microphones.items():
frame = frames.get(microphone_name, None)
if frame is None:
# Create silence using the microphone's configured channels
frame = np.zeros((1, len(microphone.channels)), dtype=np.float32)
obs_dict[f"observation.audio.{microphone_name}"] = torch.from_numpy(frame)
return obs_dict return obs_dict
def send_action(self, action: torch.Tensor) -> torch.Tensor: def send_action(self, action: torch.Tensor) -> torch.Tensor:

View File

@ -63,3 +63,25 @@ class RobotDeviceAlreadyConnectedError(Exception):
): ):
self.message = message self.message = message
super().__init__(self.message) super().__init__(self.message)
class RobotDeviceNotRecordingError(Exception):
"""Exception raised when the robot device is not recording."""
def __init__(
self,
message="This robot device is not recording. Try calling `robot_device.start_recording()` first.",
):
self.message = message
super().__init__(self.message)
class RobotDeviceAlreadyRecordingError(Exception):
"""Exception raised when the robot device is already recording."""
def __init__(
self,
message="This robot device is already recording. Try not calling `robot_device.start_recording()` twice.",
):
self.message = message
super().__init__(self.message)
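A small sketch of the guard pattern these exceptions support, mirroring the checks in the Microphone class from this diff; the Recorder class below is a toy example:

```python
from lerobot.common.robot_devices.utils import (
    RobotDeviceAlreadyRecordingError,
    RobotDeviceNotRecordingError,
)


class Recorder:
    """Toy device illustrating the recording guards."""

    def __init__(self):
        self.is_recording = False

    def start_recording(self):
        if self.is_recording:
            raise RobotDeviceAlreadyRecordingError()
        self.is_recording = True

    def stop_recording(self):
        if not self.is_recording:
            raise RobotDeviceNotRecordingError()
        self.is_recording = False
```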

View File

@ -67,8 +67,11 @@ dependencies = [
"pynput>=1.7.7", "pynput>=1.7.7",
"pyzmq>=26.2.1", "pyzmq>=26.2.1",
"rerun-sdk>=0.21.0", "rerun-sdk>=0.21.0",
"sounddevice>=0.5.1",
"soundfile>=0.13.1",
"termcolor>=2.4.0", "termcolor>=2.4.0",
"torch>=2.2.1", "torch>=2.2.1",
"torchaudio>=2.6.0",
"torchcodec>=0.2.1; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", "torchcodec>=0.2.1; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')",
"torchvision>=0.21.0", "torchvision>=0.21.0",
"wandb>=0.16.3", "wandb>=0.16.3",
@ -96,6 +99,7 @@ test = ["pytest>=8.1.0", "pytest-cov>=5.0.0", "pyserial>=3.5"]
umi = ["imagecodecs>=2024.1.1"] umi = ["imagecodecs>=2024.1.1"]
video_benchmark = ["scikit-image>=0.23.2", "pandas>=2.2.2"] video_benchmark = ["scikit-image>=0.23.2", "pandas>=2.2.2"]
xarm = ["gym-xarm>=0.1.1 ; python_version < '4.0'"] xarm = ["gym-xarm>=0.1.1 ; python_version < '4.0'"]
audio = ["librosa>=0.11.0"]
[tool.poetry] [tool.poetry]
requires-poetry = ">=2.1" requires-poetry = ">=2.1"

View File

@ -19,9 +19,9 @@ import traceback
import pytest import pytest
from serial import SerialException from serial import SerialException
from lerobot import available_cameras, available_motors, available_robots from lerobot import available_cameras, available_microphones, available_motors, available_robots
from lerobot.common.robot_devices.robots.utils import make_robot from lerobot.common.robot_devices.robots.utils import make_robot
from tests.utils import DEVICE, make_camera, make_motors_bus from tests.utils import DEVICE, make_camera, make_microphone, make_motors_bus
# Import fixture modules as plugins # Import fixture modules as plugins
pytest_plugins = [ pytest_plugins = [
@ -74,6 +74,11 @@ def is_camera_available(camera_type):
return _check_component_availability(camera_type, available_cameras, make_camera) return _check_component_availability(camera_type, available_cameras, make_camera)
@pytest.fixture
def is_microphone_available(microphone_type):
return _check_component_availability(microphone_type, available_microphones, make_microphone)
@pytest.fixture @pytest.fixture
def is_motor_available(motor_type): def is_motor_available(motor_type):
return _check_component_availability(motor_type, available_motors, make_motors_bus) return _check_component_availability(motor_type, available_motors, make_motors_bus)

View File

@ -25,6 +25,8 @@ from lerobot.common.datasets.compute_stats import (
compute_episode_stats, compute_episode_stats,
estimate_num_samples, estimate_num_samples,
get_feature_stats, get_feature_stats,
sample_audio_from_data,
sample_audio_from_path,
sample_images, sample_images,
sample_indices, sample_indices,
) )
@ -34,6 +36,10 @@ def mock_load_image_as_numpy(path, dtype, channel_first):
return np.ones((3, 32, 32), dtype=dtype) if channel_first else np.ones((32, 32, 3), dtype=dtype) return np.ones((3, 32, 32), dtype=dtype) if channel_first else np.ones((32, 32, 3), dtype=dtype)
def mock_load_audio(path):
return np.ones((16000, 2), dtype=np.float32)
@pytest.fixture @pytest.fixture
def sample_array(): def sample_array():
return np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) return np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
@ -71,6 +77,25 @@ def test_sample_images(mock_load):
assert len(images) == estimate_num_samples(100) assert len(images) == estimate_num_samples(100)
@patch("lerobot.common.datasets.compute_stats.load_audio_from_path", side_effect=mock_load_audio)
def test_sample_audio_from_path(mock_load):
audio_path = "audio.wav"
audio_samples = sample_audio_from_path(audio_path)
assert isinstance(audio_samples, np.ndarray)
assert audio_samples.shape[1] == 2
assert audio_samples.dtype == np.float32
assert len(audio_samples) == estimate_num_samples(16000)
def test_sample_audio_from_data():
audio_data = np.ones((16000, 2), dtype=np.float32)
audio_samples = sample_audio_from_data(audio_data)
assert isinstance(audio_samples, np.ndarray)
assert audio_samples.shape[1] == 2
assert audio_samples.dtype == np.float32
assert len(audio_samples) == estimate_num_samples(16000)
def test_get_feature_stats_images(): def test_get_feature_stats_images():
data = np.random.rand(100, 3, 32, 32) data = np.random.rand(100, 3, 32, 32)
stats = get_feature_stats(data, axis=(0, 2, 3), keepdims=True) stats = get_feature_stats(data, axis=(0, 2, 3), keepdims=True)
@ -79,6 +104,14 @@ def test_get_feature_stats_images():
assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape
def test_get_feature_stats_audio():
data = np.random.uniform(-1, 1, (16000, 2))
stats = get_feature_stats(data, axis=0, keepdims=True)
assert "min" in stats and "max" in stats and "mean" in stats and "std" in stats and "count" in stats
np.testing.assert_equal(stats["count"], np.array([16000]))
assert stats["min"].shape == stats["max"].shape == stats["mean"].shape == stats["std"].shape
def test_get_feature_stats_axis_0_keepdims(sample_array): def test_get_feature_stats_axis_0_keepdims(sample_array):
expected = { expected = {
"min": np.array([[1, 2, 3]]), "min": np.array([[1, 2, 3]]),
@ -137,22 +170,29 @@ def test_get_feature_stats_single_value():
def test_compute_episode_stats(): def test_compute_episode_stats():
episode_data = { episode_data = {
"observation.image": [f"image_{i}.jpg" for i in range(100)], "observation.image": [f"image_{i}.jpg" for i in range(100)],
"observation.audio": "audio.wav",
"observation.state": np.random.rand(100, 10), "observation.state": np.random.rand(100, 10),
} }
features = { features = {
"observation.image": {"dtype": "image"}, "observation.image": {"dtype": "image"},
"observation.audio": {"dtype": "audio"},
"observation.state": {"dtype": "numeric"}, "observation.state": {"dtype": "numeric"},
} }
with patch( with (
"lerobot.common.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy patch(
"lerobot.common.datasets.compute_stats.load_image_as_numpy", side_effect=mock_load_image_as_numpy
),
patch("lerobot.common.datasets.compute_stats.load_audio_from_path", side_effect=mock_load_audio),
): ):
stats = compute_episode_stats(episode_data, features) stats = compute_episode_stats(episode_data, features)
assert "observation.image" in stats and "observation.state" in stats assert "observation.image" in stats and "observation.state" in stats and "observation.audio" in stats
assert stats["observation.image"]["count"].item() == 100 assert stats["observation.image"]["count"].item() == estimate_num_samples(100)
assert stats["observation.state"]["count"].item() == 100 assert stats["observation.audio"]["count"].item() == estimate_num_samples(16000)
assert stats["observation.state"]["count"].item() == estimate_num_samples(100)
assert stats["observation.image"]["mean"].shape == (3, 1, 1) assert stats["observation.image"]["mean"].shape == (3, 1, 1)
assert stats["observation.audio"]["mean"].shape == (1, 2)
def test_assert_type_and_shape_valid(): def test_assert_type_and_shape_valid():

View File

@ -16,6 +16,7 @@
import json import json
import logging import logging
import re import re
import time
from copy import deepcopy from copy import deepcopy
from itertools import chain from itertools import chain
from pathlib import Path from pathlib import Path
@ -35,6 +36,7 @@ from lerobot.common.datasets.lerobot_dataset import (
MultiLeRobotDataset, MultiLeRobotDataset,
) )
from lerobot.common.datasets.utils import ( from lerobot.common.datasets.utils import (
DEFAULT_AUDIO_CHUNK_DURATION,
create_branch, create_branch,
flatten_dict, flatten_dict,
unflatten_dict, unflatten_dict,
@ -44,8 +46,8 @@ from lerobot.common.policies.factory import make_policy_config
from lerobot.common.robot_devices.robots.utils import make_robot from lerobot.common.robot_devices.robots.utils import make_robot
from lerobot.configs.default import DatasetConfig from lerobot.configs.default import DatasetConfig
from lerobot.configs.train import TrainPipelineConfig from lerobot.configs.train import TrainPipelineConfig
from tests.fixtures.constants import DUMMY_CHW, DUMMY_HWC, DUMMY_REPO_ID from tests.fixtures.constants import DUMMY_AUDIO_CHANNELS, DUMMY_CHW, DUMMY_HWC, DUMMY_REPO_ID
from tests.utils import require_x86_64_kernel from tests.utils import make_microphone, require_x86_64_kernel
@pytest.fixture @pytest.fixture
@ -64,6 +66,20 @@ def image_dataset(tmp_path, empty_lerobot_dataset_factory):
return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features) return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
@pytest.fixture
def audio_dataset(tmp_path, empty_lerobot_dataset_factory):
features = {
"observation.audio.microphone": {
"dtype": "audio",
"shape": (DUMMY_AUDIO_CHANNELS,),
"names": [
"channels",
],
}
}
return empty_lerobot_dataset_factory(root=tmp_path / "test", features=features)
def test_same_attributes_defined(tmp_path, lerobot_dataset_factory): def test_same_attributes_defined(tmp_path, lerobot_dataset_factory):
""" """
Instantiate a LeRobotDataset both ways with '__init__()' and 'create()' and verify that instantiated Instantiate a LeRobotDataset both ways with '__init__()' and 'create()' and verify that instantiated
@ -322,6 +338,24 @@ def test_image_array_to_pil_image_wrong_range_float_0_255():
image_array_to_pil_image(image) image_array_to_pil_image(image)
def test_add_frame_audio(audio_dataset):
dataset = audio_dataset
microphone = make_microphone(microphone_type="microphone", mock=True)
microphone.connect()
dataset.add_microphone_recording(microphone, "microphone")
time.sleep(1.0)
dataset.add_frame({"observation.audio.microphone": microphone.read(), "task": "Dummy task"})
microphone.stop_recording()
dataset.save_episode()
assert dataset[0]["observation.audio.microphone"].shape == torch.Size(
(int(DEFAULT_AUDIO_CHUNK_DURATION * microphone.sample_rate), DUMMY_AUDIO_CHANNELS)
)
# TODO(aliberts): # TODO(aliberts):
# - [ ] test various attributes & state from init and create # - [ ] test various attributes & state from init and create
# - [ ] test init with episodes and check num_frames # - [ ] test init with episodes and check num_frames
@ -354,6 +388,7 @@ def test_factory(env_name, repo_id, policy_name):
dataset = make_dataset(cfg) dataset = make_dataset(cfg)
delta_timestamps = dataset.delta_timestamps delta_timestamps = dataset.delta_timestamps
camera_keys = dataset.meta.camera_keys camera_keys = dataset.meta.camera_keys
audio_keys = dataset.meta.audio_keys
item = dataset[0] item = dataset[0]
@ -396,6 +431,11 @@ def test_factory(env_name, repo_id, policy_name):
# test c,h,w # test c,h,w
assert item[key].shape[0] == 3, f"{key}" assert item[key].shape[0] == 3, f"{key}"
for key in audio_keys:
assert item[key].dtype == torch.float32, f"{key}"
assert item[key].max() <= 1.0, f"{key}"
assert item[key].min() >= -1.0, f"{key}"
if delta_timestamps is not None: if delta_timestamps is not None:
# test missing keys in delta_timestamps # test missing keys in delta_timestamps
for key in delta_timestamps: for key in delta_timestamps:

View File

@ -29,7 +29,12 @@ DUMMY_MOTOR_FEATURES = {
}, },
} }
DUMMY_CAMERA_FEATURES = { DUMMY_CAMERA_FEATURES = {
"laptop": {"shape": (480, 640, 3), "names": ["height", "width", "channels"], "info": None}, "laptop": {
"shape": (480, 640, 3),
"names": ["height", "width", "channels"],
"info": None,
"audio": "laptop",
},
"phone": {"shape": (480, 640, 3), "names": ["height", "width", "channels"], "info": None}, "phone": {"shape": (480, 640, 3), "names": ["height", "width", "channels"], "info": None},
} }
DEFAULT_FPS = 30 DEFAULT_FPS = 30
@ -40,5 +45,18 @@ DUMMY_VIDEO_INFO = {
"video.is_depth_map": False, "video.is_depth_map": False,
"has_audio": False, "has_audio": False,
} }
DUMMY_MICROPHONE_FEATURES = {
"laptop": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None},
"phone": {"dtype": "audio", "shape": (1,), "names": ["channels"], "info": None},
}
DEFAULT_SAMPLE_RATE = 48000
DUMMY_AUDIO_CHANNELS = 2
DUMMY_AUDIO_INFO = {
"has_audio": True,
"audio.sample_rate": DEFAULT_SAMPLE_RATE,
"audio.codec": "aac",
"audio.channels": DUMMY_AUDIO_CHANNELS,
"audio.channel_layout": "stereo",
}
DUMMY_CHW = (3, 96, 128) DUMMY_CHW = (3, 96, 128)
DUMMY_HWC = (96, 128, 3) DUMMY_HWC = (96, 128, 3)

View File

@ -26,6 +26,7 @@ import torch
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset, LeRobotDatasetMetadata from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset, LeRobotDatasetMetadata
from lerobot.common.datasets.utils import ( from lerobot.common.datasets.utils import (
DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_SIZE,
DEFAULT_COMPRESSED_AUDIO_PATH,
DEFAULT_FEATURES, DEFAULT_FEATURES,
DEFAULT_PARQUET_PATH, DEFAULT_PARQUET_PATH,
DEFAULT_VIDEO_PATH, DEFAULT_VIDEO_PATH,
@ -35,6 +36,7 @@ from lerobot.common.datasets.utils import (
from tests.fixtures.constants import ( from tests.fixtures.constants import (
DEFAULT_FPS, DEFAULT_FPS,
DUMMY_CAMERA_FEATURES, DUMMY_CAMERA_FEATURES,
DUMMY_MICROPHONE_FEATURES,
DUMMY_MOTOR_FEATURES, DUMMY_MOTOR_FEATURES,
DUMMY_REPO_ID, DUMMY_REPO_ID,
DUMMY_ROBOT_TYPE, DUMMY_ROBOT_TYPE,
@ -90,6 +92,7 @@ def features_factory():
def _create_features( def _create_features(
motor_features: dict = DUMMY_MOTOR_FEATURES, motor_features: dict = DUMMY_MOTOR_FEATURES,
camera_features: dict = DUMMY_CAMERA_FEATURES, camera_features: dict = DUMMY_CAMERA_FEATURES,
audio_features: dict = DUMMY_MICROPHONE_FEATURES,
use_videos: bool = True, use_videos: bool = True,
) -> dict: ) -> dict:
if use_videos: if use_videos:
@ -101,6 +104,7 @@ def features_factory():
return { return {
**motor_features, **motor_features,
**camera_ft, **camera_ft,
**audio_features,
**DEFAULT_FEATURES, **DEFAULT_FEATURES,
} }
@ -117,15 +121,18 @@ def info_factory(features_factory):
total_frames: int = 0, total_frames: int = 0,
total_tasks: int = 0, total_tasks: int = 0,
total_videos: int = 0, total_videos: int = 0,
total_audio: int = 0,
total_chunks: int = 0, total_chunks: int = 0,
chunks_size: int = DEFAULT_CHUNK_SIZE, chunks_size: int = DEFAULT_CHUNK_SIZE,
data_path: str = DEFAULT_PARQUET_PATH, data_path: str = DEFAULT_PARQUET_PATH,
video_path: str = DEFAULT_VIDEO_PATH, video_path: str = DEFAULT_VIDEO_PATH,
audio_path: str = DEFAULT_COMPRESSED_AUDIO_PATH,
motor_features: dict = DUMMY_MOTOR_FEATURES, motor_features: dict = DUMMY_MOTOR_FEATURES,
camera_features: dict = DUMMY_CAMERA_FEATURES, camera_features: dict = DUMMY_CAMERA_FEATURES,
audio_features: dict = DUMMY_MICROPHONE_FEATURES,
use_videos: bool = True, use_videos: bool = True,
) -> dict: ) -> dict:
features = features_factory(motor_features, camera_features, use_videos) features = features_factory(motor_features, camera_features, audio_features, use_videos)
return { return {
"codebase_version": codebase_version, "codebase_version": codebase_version,
"robot_type": robot_type, "robot_type": robot_type,
@ -133,12 +140,14 @@ def info_factory(features_factory):
"total_frames": total_frames, "total_frames": total_frames,
"total_tasks": total_tasks, "total_tasks": total_tasks,
"total_videos": total_videos, "total_videos": total_videos,
"total_audio": total_audio,
"total_chunks": total_chunks, "total_chunks": total_chunks,
"chunks_size": chunks_size, "chunks_size": chunks_size,
"fps": fps, "fps": fps,
"splits": {}, "splits": {},
"data_path": data_path, "data_path": data_path,
"video_path": video_path if use_videos else None, "video_path": video_path if use_videos else None,
"audio_path": audio_path,
"features": features, "features": features,
} }
@ -162,6 +171,14 @@ def stats_factory():
"std": np.full((3, 1, 1), 0.25, dtype=np.float32).tolist(), "std": np.full((3, 1, 1), 0.25, dtype=np.float32).tolist(),
"count": [10], "count": [10],
} }
elif dtype == "audio":
stats[key] = {
"mean": np.full((shape[0],), 0.0, dtype=np.float32).tolist(),
"max": np.full((shape[0],), 1, dtype=np.float32).tolist(),
"min": np.full((shape[0],), -1, dtype=np.float32).tolist(),
"std": np.full((shape[0],), 0.5, dtype=np.float32).tolist(),
"count": [10],
}
else: else:
stats[key] = { stats[key] = {
"max": np.full(shape, 1, dtype=dtype).tolist(), "max": np.full(shape, 1, dtype=dtype).tolist(),

View File

@ -0,0 +1,88 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from functools import cache
from threading import Event, Thread
import numpy as np
from lerobot.common.utils.utils import capture_timestamp_utc
from tests.fixtures.constants import DEFAULT_SAMPLE_RATE, DUMMY_AUDIO_CHANNELS
@cache
def _generate_sound(duration: float, sample_rate: int, channels: int):
return np.random.uniform(-1, 1, size=(int(duration * sample_rate), channels)).astype(np.float32)
def query_devices(query_index: int):
return {
"name": "Mock Sound Device",
"index": query_index,
"max_input_channels": DUMMY_AUDIO_CHANNELS,
"default_samplerate": DEFAULT_SAMPLE_RATE,
}
class InputStream:
def __init__(self, *args, **kwargs):
self._mock_dict = {
"channels": DUMMY_AUDIO_CHANNELS,
"samplerate": DEFAULT_SAMPLE_RATE,
}
self._is_active = False
self._audio_callback = kwargs.get("callback")
self.callback_thread = None
self.callback_thread_stop_event = None
def _acquisition_loop(self):
if self._audio_callback is not None:
while not self.callback_thread_stop_event.is_set():
# Simulate audio data acquisition
time.sleep(0.01)
self._audio_callback(
_generate_sound(0.01, DEFAULT_SAMPLE_RATE, DUMMY_AUDIO_CHANNELS),
int(0.01 * DEFAULT_SAMPLE_RATE), # frame count is an integer, as sounddevice provides
capture_timestamp_utc(),
None,
)
def start(self):
self.callback_thread_stop_event = Event()
self.callback_thread = Thread(target=self._acquisition_loop, args=())
self.callback_thread.daemon = True
self.callback_thread.start()
self._is_active = True
@property
def active(self):
return self._is_active
def stop(self):
if self.callback_thread_stop_event is not None:
self.callback_thread_stop_event.set()
self.callback_thread.join()
self.callback_thread = None
self.callback_thread_stop_event = None
self._is_active = False
def close(self):
if self._is_active:
self.stop()
def __del__(self):
if self._is_active:
self.stop()
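For context, this mock is consumed through the mock=True path of Microphone.connect(); a hardware-free smoke test could look like the following sketch:

```python
import time

from tests.utils import make_microphone

microphone = make_microphone(microphone_type="microphone", mock=True)
microphone.connect()  # imports tests.microphones.mock_sounddevice instead of sounddevice
microphone.start_recording()
time.sleep(0.1)
chunk = microphone.read()  # random float32 samples produced by _generate_sound
microphone.stop_recording()
microphone.disconnect()
```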

View File

@ -0,0 +1,152 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests for physical microphones and their mocked versions.
If the physical microphone is not connected to the computer, or not working,
the test will be skipped.
Example of running a specific test:
```bash
pytest -sx tests/microphones/test_microphones.py::test_microphone
```
Example of running test on a real microphone connected to the computer:
```bash
pytest -sx 'tests/microphones/test_microphones.py::test_microphone[microphone-False]'
```
Example of running test on a mocked version of the microphone:
```bash
pytest -sx 'tests/microphones/test_microphones.py::test_microphone[microphone-True]'
```
"""
import time
import numpy as np
import pytest
from soundfile import read
from lerobot.common.robot_devices.utils import (
RobotDeviceAlreadyConnectedError,
RobotDeviceAlreadyRecordingError,
RobotDeviceNotConnectedError,
RobotDeviceNotRecordingError,
)
from tests.utils import TEST_MICROPHONE_TYPES, make_microphone, require_microphone
# Maximum recording time difference between two consecutive audio recordings of the same duration.
# Set to 0.02 seconds, i.e. twice the default size of the sounddevice callback buffer (we tolerate the loss of one buffer).
MAX_RECORDING_TIME_DIFFERENCE = 0.02
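# For illustration: at the 48000 Hz DEFAULT_SAMPLE_RATE used by the mock, this tolerance
# amounts to 48000 * 0.02 = 960 samples.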
DUMMY_RECORDING = "test_recording.wav"
@pytest.mark.parametrize("microphone_type, mock", TEST_MICROPHONE_TYPES)
@require_microphone
def test_microphone(tmp_path, request, microphone_type, mock):
"""Test assumes that a recroding handled with microphone.start_recording(output_file) and stop_recording() or microphone.read()
leqds to a sample that does not differ from the requested duration by more than 0.1 seconds.
"""
microphone_kwargs = {"microphone_type": microphone_type, "mock": mock}
# Test instantiating
microphone = make_microphone(**microphone_kwargs)
# Test start_recording, stop_recording, read and disconnecting before connecting raises an error
with pytest.raises(RobotDeviceNotConnectedError):
microphone.start_recording()
with pytest.raises(RobotDeviceNotConnectedError):
microphone.stop_recording()
with pytest.raises(RobotDeviceNotConnectedError):
microphone.read()
with pytest.raises(RobotDeviceNotConnectedError):
microphone.disconnect()
# Test deleting the object without connecting first
del microphone
# Test connecting
microphone = make_microphone(**microphone_kwargs)
microphone.connect()
assert microphone.is_connected
assert microphone.sample_rate is not None
assert microphone.channels is not None
# Test connecting twice raises an error
with pytest.raises(RobotDeviceAlreadyConnectedError):
microphone.connect()
# Test reading or stop recording before starting recording raises an error
with pytest.raises(RobotDeviceNotRecordingError):
microphone.read()
with pytest.raises(RobotDeviceNotRecordingError):
microphone.stop_recording()
# Test start_recording
fpath = tmp_path / DUMMY_RECORDING
microphone.start_recording(fpath)
assert microphone.is_recording
# Test start_recording twice raises an error
with pytest.raises(RobotDeviceAlreadyRecordingError):
microphone.start_recording()
# Test reading from the microphone
time.sleep(1.0)
audio_chunk = microphone.read()
assert isinstance(audio_chunk, np.ndarray)
assert audio_chunk.ndim == 2
_, c = audio_chunk.shape
assert c == len(microphone.channels)
# Test stop_recording
microphone.stop_recording()
assert fpath.exists()
assert not microphone.stream.active
assert microphone.record_thread is None
# Test stop_recording twice raises an error
with pytest.raises(RobotDeviceNotRecordingError):
microphone.stop_recording()
# Test reading and recording output similar length audio chunks
microphone.start_recording(tmp_path / DUMMY_RECORDING)
time.sleep(1.0)
audio_chunk = microphone.read()
microphone.stop_recording()
recorded_audio, recorded_sample_rate = read(fpath)
assert recorded_sample_rate == microphone.sample_rate
error_msg = (
"Recording time difference between read() and stop_recording() (in seconds)",
(len(audio_chunk) - len(recorded_audio)) / recorded_sample_rate,
)
np.testing.assert_allclose(
len(audio_chunk),
len(recorded_audio),
atol=recorded_sample_rate * MAX_RECORDING_TIME_DIFFERENCE,
err_msg=error_msg,
)
# Test disconnecting
microphone.disconnect()
assert not microphone.is_connected
# Test disconnecting with `__del__`
microphone = make_microphone(**microphone_kwargs)
microphone.connect()
del microphone

View File

@ -100,6 +100,9 @@ def test_robot(tmp_path, request, robot_type, mock):
robot.teleop_step() robot.teleop_step()
# Test data recorded during teleop are well formatted # Test data recorded during teleop are well formatted
for _, microphone in robot.microphones.items():
microphone.start_recording()
observation, action = robot.teleop_step(record_data=True) observation, action = robot.teleop_step(record_data=True)
# State # State
assert "observation.state" in observation assert "observation.state" in observation
@ -112,6 +115,11 @@ def test_robot(tmp_path, request, robot_type, mock):
assert f"observation.images.{name}" in observation assert f"observation.images.{name}" in observation
assert isinstance(observation[f"observation.images.{name}"], torch.Tensor) assert isinstance(observation[f"observation.images.{name}"], torch.Tensor)
assert observation[f"observation.images.{name}"].ndim == 3 assert observation[f"observation.images.{name}"].ndim == 3
# Microphones
for name in robot.microphones:
assert f"observation.audio.{name}" in observation
assert isinstance(observation[f"observation.audio.{name}"], torch.Tensor)
assert observation[f"observation.audio.{name}"].ndim == 2
# Action # Action
assert "action" in action assert "action" in action
assert isinstance(action["action"], torch.Tensor) assert isinstance(action["action"], torch.Tensor)
@ -124,8 +132,9 @@ def test_robot(tmp_path, request, robot_type, mock):
captured_observation = robot.capture_observation() captured_observation = robot.capture_observation()
assert set(captured_observation.keys()) == set(observation.keys()) assert set(captured_observation.keys()) == set(observation.keys())
for name in captured_observation: for name in captured_observation:
if "image" in name: if "image" in name or "audio" in name:
# TODO(rcadene): skipping image for now as it's challenging to assess equality between two consecutive frames # TODO(rcadene): skipping image for now as it's challenging to assess equality between two consecutive frames
# Also skipping for audio as audio chunks may be of different length
continue continue
torch.testing.assert_close(captured_observation[name], observation[name], rtol=1e-4, atol=1) torch.testing.assert_close(captured_observation[name], observation[name], rtol=1e-4, atol=1)
assert captured_observation[name].shape == observation[name].shape assert captured_observation[name].shape == observation[name].shape
@ -134,7 +143,7 @@ def test_robot(tmp_path, request, robot_type, mock):
robot.send_action(action["action"]) robot.send_action(action["action"])
# Test disconnecting # Test disconnecting
robot.disconnect() robot.disconnect() # Also handles microphone recording stop, life is beautiful
assert not robot.is_connected assert not robot.is_connected
for name in robot.follower_arms: for name in robot.follower_arms:
assert not robot.follower_arms[name].is_connected assert not robot.follower_arms[name].is_connected
@ -142,3 +151,5 @@ def test_robot(tmp_path, request, robot_type, mock):
assert not robot.leader_arms[name].is_connected assert not robot.leader_arms[name].is_connected
for name in robot.cameras: for name in robot.cameras:
assert not robot.cameras[name].is_connected assert not robot.cameras[name].is_connected
for name in robot.microphones:
assert not robot.microphones[name].is_connected

View File

@ -22,9 +22,11 @@ from pathlib import Path
import pytest import pytest
import torch import torch
from lerobot import available_cameras, available_motors, available_robots from lerobot import available_cameras, available_microphones, available_motors, available_robots
from lerobot.common.robot_devices.cameras.utils import Camera from lerobot.common.robot_devices.cameras.utils import Camera
from lerobot.common.robot_devices.cameras.utils import make_camera as make_camera_device from lerobot.common.robot_devices.cameras.utils import make_camera as make_camera_device
from lerobot.common.robot_devices.microphones.utils import Microphone
from lerobot.common.robot_devices.microphones.utils import make_microphone as make_microphone_device
from lerobot.common.robot_devices.motors.utils import MotorsBus from lerobot.common.robot_devices.motors.utils import MotorsBus
from lerobot.common.robot_devices.motors.utils import make_motors_bus as make_motors_bus_device from lerobot.common.robot_devices.motors.utils import make_motors_bus as make_motors_bus_device
from lerobot.common.utils.import_utils import is_package_available from lerobot.common.utils.import_utils import is_package_available
@ -39,6 +41,10 @@ TEST_CAMERA_TYPES = []
for camera_type in available_cameras: for camera_type in available_cameras:
TEST_CAMERA_TYPES += [(camera_type, True), (camera_type, False)] TEST_CAMERA_TYPES += [(camera_type, True), (camera_type, False)]
TEST_MICROPHONE_TYPES = []
for microphone_type in available_microphones:
TEST_MICROPHONE_TYPES += [(microphone_type, True), (microphone_type, False)]
TEST_MOTOR_TYPES = [] TEST_MOTOR_TYPES = []
for motor_type in available_motors: for motor_type in available_motors:
TEST_MOTOR_TYPES += [(motor_type, True), (motor_type, False)] TEST_MOTOR_TYPES += [(motor_type, True), (motor_type, False)]
@ -47,6 +53,9 @@ for motor_type in available_motors:
OPENCV_CAMERA_INDEX = int(os.environ.get("LEROBOT_TEST_OPENCV_CAMERA_INDEX", 0)) OPENCV_CAMERA_INDEX = int(os.environ.get("LEROBOT_TEST_OPENCV_CAMERA_INDEX", 0))
INTELREALSENSE_SERIAL_NUMBER = int(os.environ.get("LEROBOT_TEST_INTELREALSENSE_SERIAL_NUMBER", 128422271614)) INTELREALSENSE_SERIAL_NUMBER = int(os.environ.get("LEROBOT_TEST_INTELREALSENSE_SERIAL_NUMBER", 128422271614))
# Microphone indices used for connecting physical microphones
MICROPHONE_INDEX = int(os.environ.get("LEROBOT_TEST_MICROPHONE_INDEX", 0))
DYNAMIXEL_PORT = os.environ.get("LEROBOT_TEST_DYNAMIXEL_PORT", "/dev/tty.usbmodem575E0032081") DYNAMIXEL_PORT = os.environ.get("LEROBOT_TEST_DYNAMIXEL_PORT", "/dev/tty.usbmodem575E0032081")
DYNAMIXEL_MOTORS = { DYNAMIXEL_MOTORS = {
"shoulder_pan": [1, "xl430-w250"], "shoulder_pan": [1, "xl430-w250"],
@ -253,6 +262,29 @@ def require_camera(func):
return wrapper return wrapper
def require_microphone(func):
@wraps(func)
def wrapper(*args, **kwargs):
# Access the pytest request context to get the is_microphone_available fixture
request = kwargs.get("request")
microphone_type = kwargs.get("microphone_type")
mock = kwargs.get("mock")
if request is None:
raise ValueError("The 'request' fixture must be an argument of the test function.")
if microphone_type is None:
raise ValueError("The 'microphone_type' must be an argument of the test function.")
if mock is None:
raise ValueError("The 'mock' variable must be an argument of the test function.")
if not mock and not request.getfixturevalue("is_microphone_available"):
pytest.skip(f"A {microphone_type} microphone is not available.")
return func(*args, **kwargs)
return wrapper
def require_motor(func): def require_motor(func):
@wraps(func) @wraps(func)
def wrapper(*args, **kwargs): def wrapper(*args, **kwargs):
@ -315,6 +347,14 @@ def make_camera(camera_type: str, **kwargs) -> Camera:
raise ValueError(f"The camera type '{camera_type}' is not valid.") raise ValueError(f"The camera type '{camera_type}' is not valid.")
def make_microphone(microphone_type: str, **kwargs) -> Microphone:
if microphone_type == "microphone":
microphone_index = kwargs.pop("microphone_index", MICROPHONE_INDEX)
return make_microphone_device(microphone_type, microphone_index=microphone_index, **kwargs)
else:
raise ValueError(f"The microphone type '{microphone_type}' is not valid.")
# TODO(rcadene, aliberts): remove this dark pattern that overrides # TODO(rcadene, aliberts): remove this dark pattern that overrides
def make_motors_bus(motor_type: str, **kwargs) -> MotorsBus: def make_motors_bus(motor_type: str, **kwargs) -> MotorsBus:
if motor_type == "dynamixel": if motor_type == "dynamixel":