diff --git a/lerobot/common/datasets/compute_stats.py b/lerobot/common/datasets/compute_stats.py
index 36606719..2fab5a80 100644
--- a/lerobot/common/datasets/compute_stats.py
+++ b/lerobot/common/datasets/compute_stats.py
@@ -73,6 +73,7 @@ def sample_images(image_paths: list[str]) -> np.ndarray:
 def sample_audio_from_path(audio_path: str) -> np.ndarray:
+    """Samples audio data from an audio recording stored in a WAV file."""
     data = load_audio_from_path(audio_path)
     sampled_indices = sample_indices(len(data))
@@ -80,6 +81,7 @@ def sample_audio_from_path(audio_path: str) -> np.ndarray:
 def sample_audio_from_data(data: np.ndarray) -> np.ndarray:
+    """Samples audio data from an audio recording stored in a numpy array."""
     sampled_indices = sample_indices(len(data))
     return data[sampled_indices]
@@ -106,7 +108,7 @@ def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], featu
         elif features[key]["dtype"] == "audio":
             try:
                 ep_ft_array = sample_audio_from_path(data[0])
-            except TypeError:  # Should only be triggered for LeKiwi robot
+            except TypeError:  # Should only be triggered for LeKiwi robot, for which audio is stored chunk by chunk in a visual frame-like manner
                 ep_ft_array = sample_audio_from_data(data)
             axes_to_reduce = 0
             keepdims = True
diff --git a/lerobot/common/datasets/lerobot_dataset.py b/lerobot/common/datasets/lerobot_dataset.py
index da5874fb..e51a163d 100644
--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
@@ -150,6 +150,7 @@ class LeRobotDatasetMetadata:
         return Path(fpath)
     def get_compressed_audio_file_path(self, episode_index: int, audio_key: str) -> Path:
+        """Returns the path of the compressed (i.e. encoded) audio file."""
         episode_chunk = self.get_episode_chunk(episode_index)
         fpath = self.audio_path.format(
             episode_chunk=episode_chunk, audio_key=audio_key, episode_index=episode_index
         )
@@ -206,7 +207,7 @@ class LeRobotDatasetMetadata:
     @property
     def audio_keys(self) -> list[str]:
-        """Keys to access audio modalities (wether they are linked to a camera or not)."""
+        """Keys to access audio modalities (whether they are linked to a camera or not)."""
         return [key for key, ft in self.features.items() if ft["dtype"] == "audio"]
     @property
@@ -223,7 +224,7 @@ class LeRobotDatasetMetadata:
     def linked_audio_keys(self) -> list[str]:
         """Keys to access audio modalities linked to a camera."""
         return [key for key in self.audio_keys if key in self.audio_camera_keys_mapping]
-
+
     @property
     def unlinked_audio_keys(self) -> list[str]:
         """Keys to access audio modalities not linked to a camera."""
@@ -342,9 +343,10 @@ class LeRobotDatasetMetadata:
         been encoded the same way. Also, this means it assumes the first episode exists.
""" for key in self.unlinked_audio_keys: - if not self.features[key].get("info", None) or ( - len(self.features[key]["info"]) == 1 and "sample_rate" in self.features[key]["info"] - ): #Overwrite if info is empty or only contains sample rate (necessary to correctly save audio files recorded by LeKiwi) + if ( + not self.features[key].get("info", None) + or (len(self.features[key]["info"]) == 1 and "sample_rate" in self.features[key]["info"]) + ): # Overwrite if info is empty or only contains sample rate (necessary to correctly save audio files recorded by LeKiwi) audio_path = self.root / self.get_compressed_audio_file_path(0, key) self.info["features"][key]["info"] = get_audio_info(audio_path) @@ -568,9 +570,8 @@ class LeRobotDataset(torch.utils.data.Dataset): except (AssertionError, FileNotFoundError, NotADirectoryError): self.revision = get_safe_version(self.repo_id, self.revision) self.download_episodes( - download_videos, - download_audio - ) #Audio embedded in video files (.mp4) will be downloaded if download_videos is set to True, regardless of the value of download_audio + download_videos, download_audio + ) # Audio embedded in video files (.mp4) will be downloaded if download_videos is set to True, regardless of the value of download_audio self.hf_dataset = self.load_hf_dataset() self.episode_data_index = get_episode_data_index(self.meta.episodes, self.episodes) @@ -581,6 +582,8 @@ class LeRobotDataset(torch.utils.data.Dataset): ep_data_index_np = {k: t.numpy() for k, t in self.episode_data_index.items()} check_timestamps_sync(timestamps, episode_indices, ep_data_index_np, self.fps, self.tolerance_s) + # TODO(CarolinePascal) : add check for audio duration with respect to video duration and episode duration. + # Setup delta_indices if self.delta_timestamps is not None: check_delta_timestamps(self.delta_timestamps, self.fps, self.tolerance_s) @@ -601,7 +604,9 @@ class LeRobotDataset(torch.utils.data.Dataset): ) -> None: ignore_patterns = ["images/"] if not push_videos: - ignore_patterns.append("videos/") #Audio embedded in video files (.mp4) will be automatically pushed if videos are pushed + ignore_patterns.append( + "videos/" + ) # Audio embedded in video files (.mp4) will be automatically pushed if videos are pushed if not push_audio: ignore_patterns.append("audio/") @@ -670,7 +675,9 @@ class LeRobotDataset(torch.utils.data.Dataset): files = None ignore_patterns = [] if not download_videos: - ignore_patterns.append("videos/") #Audio embedded in video files (.mp4) will be automatically downloaded if videos are downloaded + ignore_patterns.append( + "videos/" + ) # Audio embedded in video files (.mp4) will be automatically downloaded if videos are downloaded if not download_audio: ignore_patterns.append("audio/") if self.episodes is not None: @@ -785,7 +792,7 @@ class LeRobotDataset(torch.utils.data.Dataset): query_indices: dict[str, list[int]] | None = None, ) -> dict[str, list[float]]: query_timestamps = {} - for key in self.meta.audio_keys: #Standalone audio and audio embedded in video as well ! + for key in self.meta.audio_keys: # Standalone audio and audio embedded in video as well ! 
             if query_indices is not None and key in query_indices:
                 timestamps = self.hf_dataset.select(query_indices[key])["timestamp"]
                 query_timestamps[key] = torch.stack(timestamps).tolist()
@@ -821,12 +828,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
     ) -> dict[str, torch.Tensor]:
         item = {}
         for audio_key, query_ts in query_timestamps.items():
-            #Audio stored with video in a single .mp4 file
+            # Audio stored with video in a single .mp4 file
             if audio_key in self.meta.linked_audio_keys:
                 audio_path = self.root / self.meta.get_video_file_path(
                     ep_idx, self.meta.audio_camera_keys_mapping[audio_key]
                 )
-            #Audio stored alone in a separate .m4a file
+            # Audio stored alone in a separate .m4a file
             else:
                 audio_path = self.root / self.meta.get_compressed_audio_file_path(ep_idx, audio_key)
             audio_chunk = decode_audio(audio_path, query_ts, query_duration, self.audio_backend)
@@ -957,9 +964,11 @@ class LeRobotDataset(torch.utils.data.Dataset):
                     self._save_image(frame[key], img_path)
                     self.episode_buffer[key].append(str(img_path))
             elif self.features[key]["dtype"] == "audio":
-                if self.meta.robot_type.startswith("lekiwi"):
+                if self.meta.robot_type.startswith(
+                    "lekiwi"
+                ):  # Raw data storage should only be triggered for LeKiwi robot, for which audio is stored chunk by chunk in a visual frame-like manner
                    self.episode_buffer[key].append(frame[key])
-                else:
+                else:  # Otherwise, only the audio file path is stored in the episode buffer
                     if frame_index == 0:
                         audio_path = self._get_raw_audio_file_path(
                             episode_index=self.episode_buffer["episode_index"], audio_key=key
                         )
@@ -972,7 +981,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
     def add_microphone_recording(self, microphone: Microphone, microphone_key: str) -> None:
         """
-        This function will start recording audio from the microphone and save it to disk.
+        Starts recording audio data provided by the microphone and directly writes it to a .wav file.
""" audio_dir = self._get_raw_audio_file_path( @@ -1025,7 +1034,9 @@ class LeRobotDataset(torch.utils.data.Dataset): if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["image", "video"]: continue elif ft["dtype"] == "audio": - if self.meta.robot_type.startswith("lekiwi"): + if self.meta.robot_type.startswith( + "lekiwi" + ): # Raw data storage should only be triggered for LeKiwi robot, for which audio is stored chunk by chunk in a visual frame-like manner episode_buffer[key] = np.concatenate(episode_buffer[key], axis=0) continue episode_buffer[key] = np.stack(episode_buffer[key]) @@ -1033,7 +1044,9 @@ class LeRobotDataset(torch.utils.data.Dataset): self._wait_image_writer() self._save_episode_table(episode_buffer, episode_index) - if self.meta.robot_type.startswith("lekiwi"): + if self.meta.robot_type.startswith( + "lekiwi" + ): # Raw data storage should only be triggered for LeKiwi robot, for which audio is stored chunk by chunk in a visual frame-like manner for key in self.meta.audio_keys: audio_path = self._get_raw_audio_file_path( episode_index=self.episode_buffer["episode_index"][0], audio_key=key @@ -1053,7 +1066,7 @@ class LeRobotDataset(torch.utils.data.Dataset): for key in self.meta.video_keys: episode_buffer[key] = video_paths[key] - if len(self.meta.unlinked_audio_keys) > 0: #Linked audio is already encoded in the video files + if len(self.meta.unlinked_audio_keys) > 0: # Linked audio is already encoded in the video files _ = self.encode_episode_audio(episode_index) # `meta.save_episode` be executed after encoding the videos @@ -1080,7 +1093,7 @@ class LeRobotDataset(torch.utils.data.Dataset): if img_dir.is_dir(): shutil.rmtree(self.root / "images") - # delete raw audio + # delete raw audio files raw_audio_files = list(self.root.rglob("*.wav")) for raw_audio_file in raw_audio_files: raw_audio_file.unlink() diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index 0511610e..d1b25023 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -52,14 +52,14 @@ def decode_audio( Decodes audio using the specified backend. Args: audio_path (Path): Path to the audio file. - timestamps (list[float]): List of timestamps to extract frames. - tolerance_s (float): Allowed deviation in seconds for frame retrieval. - backend (str, optional): Backend to use for decoding. Defaults to "pyav". + timestamps (list[float]): List of (starting) timestamps to extract audio chunks. + duration (float): Duration of the audio chunks in seconds. + backend (str, optional): Backend to use for decoding. Defaults to "ffmpeg". Returns: - torch.Tensor: Decoded frames. + torch.Tensor: Decoded audio chunks. - Currently supports pyav. + Currently supports ffmpeg. """ if backend == "torchcodec": raise NotImplementedError("torchcodec is not yet supported for audio decoding") @@ -82,7 +82,6 @@ def decode_audio_torchvision( audio_sample_rate = reader.get_src_stream_info(reader.default_audio_stream).sample_rate # TODO(CarolinePascal) : sort timestamps ? 
-
     reader.add_basic_audio_stream(
         frames_per_chunk=int(ceil(duration * audio_sample_rate)),  # Too much is better than not enough
         buffer_chunk_size=-1,  # No dropping frames
@@ -317,7 +316,7 @@ def decode_video_frames_torchcodec(
 def encode_audio(
     input_path: Path | str,
     output_path: Path | str,
-    codec: str = "aac",
+    codec: str = "aac",  # TODO(CarolinePascal) : investigate Fraunhofer FDK AAC (libfdk_aac) codec and constant (file size control) / variable (quality control) bitrate options
     log_level: str | None = "error",
     overwrite: bool = False,
 ) -> None:
@@ -346,7 +345,7 @@ def encode_audio(
     if not output_path.exists():
         raise OSError(
-            f"Video encoding did not work. File not found: {output_path}. "
+            f"Audio encoding did not work. File not found: {output_path}. "
             f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`"
         )
diff --git a/lerobot/common/robot_devices/microphones/microphone.py b/lerobot/common/robot_devices/microphones/microphone.py
index 947fdfea..b08842c2 100644
--- a/lerobot/common/robot_devices/microphones/microphone.py
+++ b/lerobot/common/robot_devices/microphones/microphone.py
@@ -44,6 +44,10 @@ from lerobot.common.utils.utils import capture_timestamp_utc
 def find_microphones(raise_when_empty=False, mock=False) -> list[dict]:
+    """
+    Finds and lists all microphones compatible with sounddevice (and the underlying PortAudio library).
+    Most microphones and sound cards are compatible, across all OS (Linux, Mac, Windows).
+    """
     microphones = []
     if mock:
@@ -72,6 +76,11 @@ def find_microphones(raise_when_empty=False, mock=False) -> list[dict]:
 def record_audio_from_microphones(
     output_dir: Path, microphone_ids: list[int] | None = None, record_time_s: float = 2.0
 ):
+    """
+    Records audio from all the channels of the specified microphones for the specified duration.
+    If no microphone ids are provided, all available microphones will be used.
+    """
+
     if microphone_ids is None or len(microphone_ids) == 0:
         microphones = find_microphones()
         microphone_ids = [m["index"] for m in microphones]
@@ -112,7 +121,7 @@ def record_audio_from_microphones(
 class Microphone:
     """
-    The Microphone class handles all microphones compatible with sounddevice (and the underlying PortAudio library). Most microphones and sound cards are compatible, accross all OS (Linux, Mac, Windows).
+    The Microphone class handles all microphones compatible with sounddevice (and the underlying PortAudio library). Most microphones and sound cards are compatible, across all OS (Linux, Mac, Windows).
     A Microphone instance requires the sounddevice index of the microphone, which may be obtained using `python -m sounddevice`.
     It also requires the recording sample rate as well as the list of recorded channels.
@@ -146,11 +155,11 @@ class Microphone:
         # Input audio stream
         self.stream = None
-        # Thread-safe concurrent queue to store the recorded/read audio
+        # Thread/Process-safe concurrent queue to store the recorded/read audio
         self.record_queue = None
         self.read_queue = None
-        # Thread to handle data reading and file writing in a separate thread (safely)
+        # Thread/Process to handle data reading and file writing in a separate thread/process (safely)
         self.record_thread = None
         self.record_stop_event = None
@@ -160,6 +169,9 @@ class Microphone:
         self.is_writing = False
     def connect(self) -> None:
+        """
+        Connects the microphone and checks if the requested acquisition parameters are compatible with the microphone.
+ """ if self.is_connected: raise RobotDeviceAlreadyConnectedError( f"Microphone {self.microphone_index} is already connected." @@ -214,15 +226,18 @@ class Microphone: dtype="float32", callback=self._audio_callback, ) - # Remark : the blocksize parameter could be passed to the stream to ensure that audio_callback always recieve same length buffers. - # However, this may lead to additionnal latency. We thus stick to blocksize=0 which means that audio_callback will recieve varying length buffers, but with no addtional latency. + # Remark : the blocksize parameter could be passed to the stream to ensure that audio_callback always receive same length buffers. + # However, this may lead to additional latency. We thus stick to blocksize=0 which means that audio_callback will receive varying length buffers, but with no additional latency. self.is_connected = True def _audio_callback(self, indata, frames, time, status) -> None: + """ + Low-level sounddevice callback. + """ if status: logging.warning(status) - # Slicing makes copy unecessary + # Slicing makes copy unnecessary # Two separate queues are necessary because .get() also pops the data from the queue if self.is_writing: self.record_queue.put(indata[:, self.channels]) @@ -230,6 +245,9 @@ class Microphone: @staticmethod def _record_loop(queue, event: Event, sample_rate: int, channels: list[int], output_file: Path) -> None: + """ + Thread/Process-safe loop to write audio data into a file. + """ # Can only be run on a single process/thread for file writing safety with sf.SoundFile( output_file, @@ -249,9 +267,7 @@ class Microphone: def _read(self) -> np.ndarray: """ - Gets audio data from the queue and coverts it to a numpy array. - -> PROS : Inherently thread safe, no need to lock the queue, lightweight CPU usage - -> CONS : Reading duration does not scale well with the number of channels and reading duration + Thread/Process-safe callback to read available audio data """ audio_readings = np.empty((0, len(self.channels))) @@ -266,6 +282,9 @@ class Microphone: return audio_readings def read(self) -> np.ndarray: + """ + Reads the last audio chunk recorded by the microphone, e.g. all samples recorded since the last read or since the beginning of the recording. + """ if not self.is_connected: raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.") if not self.is_recording: @@ -284,6 +303,9 @@ class Microphone: return audio_readings def start_recording(self, output_file: str | None = None, multiprocessing: bool | None = False) -> None: + """ + Starts the recording of the microphone. If output_file is provided, the audio will be written to this file. + """ if not self.is_connected: raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.") if self.is_recording: @@ -337,6 +359,9 @@ class Microphone: self.stream.start() def stop_recording(self) -> None: + """ + Stops the recording of the microphones. + """ if not self.is_connected: raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.") if not self.is_recording: @@ -356,6 +381,9 @@ class Microphone: self.is_writing = False def disconnect(self) -> None: + """ + Disconnects the microphone and stops the recording. 
+ """ if not self.is_connected: raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.") @@ -385,7 +413,7 @@ if __name__ == "__main__": "--output-dir", type=Path, default="outputs/audio_from_microphones", - help="Set directory to save an audio snipet for each microphone.", + help="Set directory to save an audio snippet for each microphone.", ) parser.add_argument( "--record-time-s", diff --git a/lerobot/common/robot_devices/robots/mobile_manipulator.py b/lerobot/common/robot_devices/robots/mobile_manipulator.py index 7727abb9..4af008ed 100644 --- a/lerobot/common/robot_devices/robots/mobile_manipulator.py +++ b/lerobot/common/robot_devices/robots/mobile_manipulator.py @@ -381,7 +381,7 @@ class MobileManipulator: if frame_candidate is not None: frames[cam_name] = frame_candidate - # Recieve audio + # Receive audio for microphone_name, audio_data in audio_dict.items(): if audio_data: frames[microphone_name] = audio_data