diff --git a/lerobot/common/datasets/audio_utils.py b/lerobot/common/datasets/audio_utils.py
new file mode 100644
index 00000000..901fad52
--- /dev/null
+++ b/lerobot/common/datasets/audio_utils.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import logging
+import subprocess
+from collections import OrderedDict
+from pathlib import Path
+
+import torch
+import torchaudio
+from numpy import ceil
+
+
+def decode_audio(
+    audio_path: Path | str,
+    timestamps: list[float],
+    duration: float,
+    backend: str | None = "ffmpeg",
+) -> torch.Tensor:
+    """
+    Decodes audio using the specified backend.
+    Args:
+        audio_path (Path): Path to the audio file.
+        timestamps (list[float]): List of (starting) timestamps to extract audio chunks.
+        duration (float): Duration of the audio chunks in seconds.
+        backend (str, optional): Backend to use for decoding. Defaults to "ffmpeg".
+
+    Returns:
+        torch.Tensor: Decoded audio chunks.
+
+    Currently supports ffmpeg.
+    """
+    if backend == "torchcodec":
+        raise NotImplementedError("torchcodec is not yet supported for audio decoding")
+    elif backend == "ffmpeg":
+        return decode_audio_torchaudio(audio_path, timestamps, duration)
+    else:
+        raise ValueError(f"Unsupported audio backend: {backend}")
+
+
+def decode_audio_torchaudio(
+    audio_path: Path | str,
+    timestamps: list[float],
+    duration: float,
+    log_loaded_timestamps: bool = False,
+) -> torch.Tensor:
+    # TODO(CarolinePascal) : add channels selection
+    audio_path = str(audio_path)
+
+    reader = torchaudio.io.StreamReader(src=audio_path)
+    audio_sample_rate = reader.get_src_stream_info(reader.default_audio_stream).sample_rate
+
+    # TODO(CarolinePascal) : sort timestamps ?
+    reader.add_basic_audio_stream(
+        frames_per_chunk=int(ceil(duration * audio_sample_rate)),  # Too much is better than not enough
+        buffer_chunk_size=-1,  # No dropping frames
+        format="fltp",  # Format as float32
+    )
+
+    audio_chunks = []
+    for ts in timestamps:
+        reader.seek(ts)  # Default to closest audio sample
+        status = reader.fill_buffer()
+        if status != 0:
+            logging.warning("Audio stream reached end of recording before decoding desired timestamps.")
+
+        current_audio_chunk = reader.pop_chunks()[0]
+
+        if log_loaded_timestamps:
+            logging.info(
+                f"audio chunk loaded at starting timestamp={current_audio_chunk['pts']:.4f} with duration={len(current_audio_chunk) / audio_sample_rate:.4f}"
+            )
+
+        audio_chunks.append(current_audio_chunk)
+
+    audio_chunks = torch.stack(audio_chunks)
+
+    assert len(timestamps) == len(audio_chunks)
+    return audio_chunks
+
+
+def encode_audio(
+    input_path: Path | str,
+    output_path: Path | str,
+    codec: str = "aac",  # TODO(CarolinePascal) : investigate Fraunhofer FDK AAC (libfdk_aac) codec and constant (file size control) / variable (quality control) bitrate options
+    log_level: str | None = "error",
+    overwrite: bool = False,
+) -> None:
+    """Encodes an audio file using ffmpeg."""
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    ffmpeg_args = OrderedDict(
+        [
+            ("-i", str(input_path)),
+            ("-acodec", codec),
+        ]
+    )
+
+    if log_level is not None:
+        ffmpeg_args["-loglevel"] = str(log_level)
+
+    ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair]
+    if overwrite:
+        ffmpeg_args.append("-y")
+
+    ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(output_path)]
+
+    # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
+    subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
+
+    if not output_path.exists():
+        raise OSError(
+            f"Audio encoding did not work. File not found: {output_path}. "
+            f"Try running the command manually to debug: `{' '.join(ffmpeg_cmd)}`"
+        )
+
+
+def get_audio_info(video_path: Path | str) -> dict:
+    ffprobe_audio_cmd = [
+        "ffprobe",
+        "-v",
+        "error",
+        "-select_streams",
+        "a:0",
+        "-show_entries",
+        "stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration",
+        "-of",
+        "json",
+        str(video_path),
+    ]
+    result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"Error running ffprobe: {result.stderr}")
+
+    info = json.loads(result.stdout)
+    audio_stream_info = info["streams"][0] if info.get("streams") else None
+    if audio_stream_info is None:
+        return {"has_audio": False}
+
+    # Return the information, defaulting to None if no audio stream is present
+    return {
+        "has_audio": True,
+        "audio.channels": audio_stream_info.get("channels", None),
+        "audio.codec": audio_stream_info.get("codec_name", None),
+        "audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None,
+        "audio.sample_rate": int(audio_stream_info["sample_rate"])
+        if audio_stream_info.get("sample_rate")
+        else None,
+        "audio.bit_depth": audio_stream_info.get("bit_depth", None),
+        "audio.channel_layout": audio_stream_info.get("channel_layout", None),
+    }
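A minimal usage sketch of the new module, for context only (file names are illustrative, and it assumes ffmpeg is on PATH and a raw recording exists):

# Sketch, not part of the diff: exercises the three public helpers above.
from lerobot.common.datasets.audio_utils import decode_audio, encode_audio, get_audio_info

encode_audio("episode_000000.wav", "episode_000000.m4a", codec="aac", overwrite=True)
print(get_audio_info("episode_000000.m4a"))
# e.g. {"has_audio": True, "audio.codec": "aac", "audio.sample_rate": 48000, ...}

# Two 0.5 s chunks starting at t=1.0 s and t=2.5 s.
chunks = decode_audio("episode_000000.m4a", timestamps=[1.0, 2.5], duration=0.5)
# chunks.shape == (2, ceil(0.5 * sample_rate), channels)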
" + f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`" + ) + + +def get_audio_info(video_path: Path | str) -> dict: + ffprobe_audio_cmd = [ + "ffprobe", + "-v", + "error", + "-select_streams", + "a:0", + "-show_entries", + "stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration", + "-of", + "json", + str(video_path), + ] + result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Error running ffprobe: {result.stderr}") + + info = json.loads(result.stdout) + audio_stream_info = info["streams"][0] if info.get("streams") else None + if audio_stream_info is None: + return {"has_audio": False} + + # Return the information, defaulting to None if no audio stream is present + return { + "has_audio": True, + "audio.channels": audio_stream_info.get("channels", None), + "audio.codec": audio_stream_info.get("codec_name", None), + "audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None, + "audio.sample_rate": int(audio_stream_info["sample_rate"]) + if audio_stream_info.get("sample_rate") + else None, + "audio.bit_depth": audio_stream_info.get("bit_depth", None), + "audio.channel_layout": audio_stream_info.get("channel_layout", None), + } diff --git a/lerobot/common/datasets/lerobot_dataset.py b/lerobot/common/datasets/lerobot_dataset.py index 5e13f9e5..6c7af6a3 100644 --- a/lerobot/common/datasets/lerobot_dataset.py +++ b/lerobot/common/datasets/lerobot_dataset.py @@ -32,6 +32,11 @@ from huggingface_hub.constants import REPOCARD_NAME from huggingface_hub.errors import RevisionNotFoundError from lerobot.common.constants import HF_LEROBOT_HOME +from lerobot.common.datasets.audio_utils import ( + decode_audio, + encode_audio, + get_audio_info, +) from lerobot.common.datasets.compute_stats import aggregate_stats, compute_episode_stats from lerobot.common.datasets.image_writer import AsyncImageWriter, write_image from lerobot.common.datasets.utils import ( @@ -70,11 +75,8 @@ from lerobot.common.datasets.utils import ( ) from lerobot.common.datasets.video_utils import ( VideoFrame, - decode_audio, decode_video_frames, - encode_audio, encode_video_frames, - get_audio_info, get_safe_default_codec, get_video_info, ) @@ -207,29 +209,9 @@ class LeRobotDatasetMetadata: @property def audio_keys(self) -> list[str]: - """Keys to access audio modalities (whether they are linked to a camera or not).""" + """Keys to access audio modalities.""" return [key for key, ft in self.features.items() if ft["dtype"] == "audio"] - @property - def audio_camera_keys_mapping(self) -> dict[str, str]: - """Mapping between camera keys and audio keys when both are linked.""" - return { - self.features[camera_key]["audio"]: camera_key - for camera_key in self.camera_keys - if self.features[camera_key]["audio"] is not None - and self.features[camera_key]["dtype"] == "video" - } - - @property - def linked_audio_keys(self) -> list[str]: - """Keys to access audio modalities linked to a camera.""" - return [key for key in self.audio_keys if key in self.audio_camera_keys_mapping] - - @property - def unlinked_audio_keys(self) -> list[str]: - """Keys to access audio modalities not linked to a camera.""" - return [key for key in self.audio_keys if key not in self.audio_camera_keys_mapping] - @property def names(self) -> dict[str, list | dict]: """Names of the various dimensions of vector modalities.""" @@ -310,7 +292,7 @@ class LeRobotDatasetMetadata: 
             self.update_video_info()
 
         self.info["total_audio"] += len(self.audio_keys)
-        if len(self.unlinked_audio_keys) > 0:
+        if len(self.audio_keys) > 0:
             self.update_audio_info()
 
         write_info(self.info, self.root)
@@ -342,7 +324,7 @@ class LeRobotDatasetMetadata:
         Warning: this function writes info from first episode audio, implicitly assuming that all audio have been
         encoded the same way. Also, this means it assumes the first episode exists.
         """
-        for key in self.unlinked_audio_keys:
+        for key in self.audio_keys:
             if (
                 not self.features[key].get("info", None)
                 or (len(self.features[key]["info"]) == 1 and "sample_rate" in self.features[key]["info"])
@@ -480,17 +462,31 @@ class LeRobotDataset(torch.utils.data.Dataset):
         │   ├── info.json
         │   ├── stats.json
         │   └── tasks.jsonl
-        └── videos
+        ├── videos
+        │   ├── chunk-000
+        │   │   ├── observation.images.laptop
+        │   │   │   ├── episode_000000.mp4
+        │   │   │   ├── episode_000001.mp4
+        │   │   │   ├── episode_000002.mp4
+        │   │   │   └── ...
+        │   │   ├── observation.images.phone
+        │   │   │   ├── episode_000000.mp4
+        │   │   │   ├── episode_000001.mp4
+        │   │   │   ├── episode_000002.mp4
+        │   │   │   └── ...
+        │   ├── chunk-001
+        │   └── ...
+        └── audio
             ├── chunk-000
-            │   ├── observation.images.laptop
-            │   │   ├── episode_000000.mp4
-            │   │   ├── episode_000001.mp4
-            │   │   ├── episode_000002.mp4
+            │   ├── observation.audio.laptop
+            │   │   ├── episode_000000.m4a
+            │   │   ├── episode_000001.m4a
+            │   │   ├── episode_000002.m4a
             │   │   └── ...
-            │   ├── observation.images.phone
-            │   │   ├── episode_000000.mp4
-            │   │   ├── episode_000001.mp4
-            │   │   ├── episode_000002.mp4
+            │   ├── observation.audio.phone
+            │   │   ├── episode_000000.m4a
+            │   │   ├── episode_000001.m4a
+            │   │   ├── episode_000002.m4a
             │   │   └── ...
             ├── chunk-001
             └── ...
@@ -569,9 +565,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
             self.hf_dataset = self.load_hf_dataset()
         except (AssertionError, FileNotFoundError, NotADirectoryError):
             self.revision = get_safe_version(self.repo_id, self.revision)
-            self.download_episodes(
-                download_videos, download_audio
-            )  # Audio embedded in video files (.mp4) will be downloaded if download_videos is set to True, regardless of the value of download_audio
+            self.download_episodes(download_videos, download_audio)
             self.hf_dataset = self.load_hf_dataset()
 
         self.episode_data_index = get_episode_data_index(self.meta.episodes, self.episodes)
@@ -582,7 +576,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
             ep_data_index_np = {k: t.numpy() for k, t in self.episode_data_index.items()}
             check_timestamps_sync(timestamps, episode_indices, ep_data_index_np, self.fps, self.tolerance_s)
 
-        # TODO(CarolinePascal) : add check for audio duration with respect to video duration and episode duration.
+        # TODO(CarolinePascal) : add check for audio duration with respect to episode duration, BUT this will be CPU-expensive if there are many episodes!
 
         # Setup delta_indices
         if self.delta_timestamps is not None:
@@ -604,9 +598,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
     ) -> None:
         ignore_patterns = ["images/"]
         if not push_videos:
-            ignore_patterns.append(
-                "videos/"
-            )  # Audio embedded in video files (.mp4) will be automatically pushed if videos are pushed
+            ignore_patterns.append("videos/")
         if not push_audio:
             ignore_patterns.append("audio/")
 
@@ -675,9 +667,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
         files = None
         ignore_patterns = []
         if not download_videos:
-            ignore_patterns.append(
-                "videos/"
-            )  # Audio embedded in video files (.mp4) will be automatically downloaded if videos are downloaded
+            ignore_patterns.append("videos/")
         if not download_audio:
             ignore_patterns.append("audio/")
         if self.episodes is not None:
@@ -696,10 +686,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
             ]
             fpaths += video_files
 
-        if len(self.meta.unlinked_audio_keys) > 0:
+        if len(self.meta.audio_keys) > 0:
             audio_files = [
                 str(self.meta.get_compressed_audio_file_path(ep_idx, audio_key))
-                for audio_key in self.meta.unlinked_audio_keys
+                for audio_key in self.meta.audio_keys
                 for ep_idx in episodes
             ]
             fpaths += audio_files
@@ -792,7 +782,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
         query_indices: dict[str, list[int]] | None = None,
    ) -> dict[str, list[float]]:
        query_timestamps = {}
-        for key in self.meta.audio_keys:  # Standalone audio and audio embedded in video as well !
+        for key in self.meta.audio_keys:
             if query_indices is not None and key in query_indices:
                 timestamps = self.hf_dataset.select(query_indices[key])["timestamp"]
                 query_timestamps[key] = torch.stack(timestamps).tolist()
@@ -828,14 +818,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
     ) -> dict[str, torch.Tensor]:
         item = {}
         for audio_key, query_ts in query_timestamps.items():
-            # Audio stored with video in a single .mp4 file
-            if audio_key in self.meta.linked_audio_keys:
-                audio_path = self.root / self.meta.get_video_file_path(
-                    ep_idx, self.meta.audio_camera_keys_mapping[audio_key]
-                )
-            # Audio stored alone in a separate .m4a file
-            else:
-                audio_path = self.root / self.meta.get_compressed_audio_file_path(ep_idx, audio_key)
+            audio_path = self.root / self.meta.get_compressed_audio_file_path(ep_idx, audio_key)
 
             audio_chunk = decode_audio(audio_path, query_ts, query_duration, self.audio_backend)
             item[audio_key] = audio_chunk.squeeze(0)
         return item
@@ -966,7 +949,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
             elif self.features[key]["dtype"] == "audio":
                 if (
                     self.meta.robot_type is not None and self.meta.robot_type.startswith("lekiwi")
-                ):  # Rw data storage should only be triggered for LeKiwi robot, for which audio is stored chunk by chunk in a visual frame-like manner
+                ):  # Raw data storage should only be triggered for LeKiwi robot, for which audio is stored chunk by chunk in a visual frame-like manner
                     self.episode_buffer[key].append(frame[key])
                 else:  # Otherwise, only the audio file path is stored in the episode buffer
                     if frame_index == 0:
@@ -1062,12 +1045,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
         ep_stats = compute_episode_stats(episode_buffer, self.features)
 
         if len(self.meta.video_keys) > 0:
-            video_paths = self.encode_episode_videos(episode_index)
-            for key in self.meta.video_keys:
-                episode_buffer[key] = video_paths[key]
+            self.encode_episode_videos(episode_index)
 
-        if len(self.meta.unlinked_audio_keys) > 0:  # Linked audio is already encoded in the video files
-            _ = self.encode_episode_audio(episode_index)
+        if len(self.meta.audio_keys) > 0:
+            self.encode_episode_audio(episode_index)
 
         # `meta.save_episode` must be executed after encoding the videos
         self.meta.save_episode(episode_index, episode_length, episode_tasks, ep_stats)
@@ -1177,12 +1158,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
                 episode_index=episode_index, image_key=video_key, frame_index=0
             ).parent
 
-            audio_path = None
-            if self.meta.features[video_key]["audio"] is not None:
-                audio_key = self.meta.features[video_key]["audio"]
-                audio_path = self._get_raw_audio_file_path(episode_index, audio_key)
-
-            encode_video_frames(img_dir, video_path, self.fps, audio_path=audio_path, overwrite=True)
+            encode_video_frames(img_dir, video_path, self.fps, overwrite=True)
 
         return video_paths
 
@@ -1193,7 +1169,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
         since video encoding with ffmpeg is already using multithreading.
         """
         audio_paths = {}
-        for audio_key in self.meta.unlinked_audio_keys:
+        for audio_key in self.meta.audio_keys:
             input_audio_path = self.root / self._get_raw_audio_file_path(episode_index, audio_key)
             output_audio_path = self.root / self.meta.get_compressed_audio_file_path(episode_index, audio_key)
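With the linked/unlinked distinction removed, every audio key now resolves to a standalone .m4a file and retrieval follows a single code path. A hedged sketch of what a consumer sees after this change (the repo_id is hypothetical):

from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

ds = LeRobotDataset("user/dataset-with-audio")  # hypothetical repo with audio features
item = ds[0]
for key in ds.meta.audio_keys:  # e.g. "observation.audio.laptop"
    # _query_audio() now always reads audio/chunk-XXX/<key>/episode_XXXXXX.m4a;
    # each item carries one fixed-duration chunk of shape (samples, channels).
    print(key, item[key].shape)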
diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py
index c7a5da61..fbf0b48c 100644
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -25,10 +25,8 @@ from typing import Any, ClassVar
 
 import pyarrow as pa
 import torch
-import torchaudio
 import torchvision
 from datasets.features.features import register_feature
-from numpy import ceil
 from PIL import Image
 
 
@@ -42,74 +40,6 @@ def get_safe_default_codec():
     return "pyav"
 
 
-def decode_audio(
-    audio_path: Path | str,
-    timestamps: list[float],
-    duration: float,
-    backend: str | None = "ffmpeg",
-) -> torch.Tensor:
-    """
-    Decodes audio using the specified backend.
-    Args:
-        audio_path (Path): Path to the audio file.
-        timestamps (list[float]): List of (starting) timestamps to extract audio chunks.
-        duration (float): Duration of the audio chunks in seconds.
-        backend (str, optional): Backend to use for decoding. Defaults to "ffmpeg".
-
-    Returns:
-        torch.Tensor: Decoded audio chunks.
-
-    Currently supports ffmpeg.
-    """
-    if backend == "torchcodec":
-        raise NotImplementedError("torchcodec is not yet supported for audio decoding")
-    elif backend == "ffmpeg":
-        return decode_audio_torchaudio(audio_path, timestamps, duration)
-    else:
-        raise ValueError(f"Unsupported video backend: {backend}")
-
-
-def decode_audio_torchaudio(
-    audio_path: Path | str,
-    timestamps: list[float],
-    duration: float,
-    log_loaded_timestamps: bool = False,
-) -> torch.Tensor:
-    # TODO(CarolinePascal) : add channels selection
-    audio_path = str(audio_path)
-
-    reader = torchaudio.io.StreamReader(src=audio_path)
-    audio_sample_rate = reader.get_src_stream_info(reader.default_audio_stream).sample_rate
-
-    # TODO(CarolinePascal) : sort timestamps ?
-    reader.add_basic_audio_stream(
-        frames_per_chunk=int(ceil(duration * audio_sample_rate)),  # Too much is better than not enough
-        buffer_chunk_size=-1,  # No dropping frames
-        format="fltp",  # Format as float32
-    )
-
-    audio_chunks = []
-    for ts in timestamps:
-        reader.seek(ts)  # Default to closest audio sample
-        status = reader.fill_buffer()
-        if status != 0:
-            logging.warning("Audio stream reached end of recording before decoding desired timestamps.")
-
-        current_audio_chunk = reader.pop_chunks()[0]
-
-        if log_loaded_timestamps:
-            logging.info(
-                f"audio chunk loaded at starting timestamp={current_audio_chunk['pts']:.4f} with duration={len(current_audio_chunk) / audio_sample_rate:.4f}"
-            )
-
-        audio_chunks.append(current_audio_chunk)
-
-    audio_chunks = torch.stack(audio_chunks)
-
-    assert len(timestamps) == len(audio_chunks)
-    return audio_chunks
-
-
 def decode_video_frames(
     video_path: Path | str,
     timestamps: list[float],
@@ -313,53 +243,14 @@ def decode_video_frames_torchcodec(
     return closest_frames
 
 
-def encode_audio(
-    input_path: Path | str,
-    output_path: Path | str,
-    codec: str = "aac",  # TODO(CarolinePascal) : investigate Fraunhofer FDK AAC (libfdk_aac) codec and and constant (file size control) /variable (quality control) bitrate options
-    log_level: str | None = "error",
-    overwrite: bool = False,
-) -> None:
-    """Encodes an audio file using ffmpeg."""
-    output_path = Path(output_path)
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-
-    ffmpeg_args = OrderedDict(
-        [
-            ("-i", str(input_path)),
-            ("-acodec", codec),
-        ]
-    )
-
-    if log_level is not None:
-        ffmpeg_args["-loglevel"] = str(log_level)
-
-    ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair]
-    if overwrite:
-        ffmpeg_args.append("-y")
-
-    ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(output_path)]
-
-    # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
-    subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
-
-    if not output_path.exists():
-        raise OSError(
-            f"Audio encoding did not work. File not found: {output_path}. "
" - f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`" - ) - - def encode_video_frames( imgs_dir: Path | str, video_path: Path | str, fps: int, - audio_path: Path | str | None = None, vcodec: str = "libsvtav1", pix_fmt: str = "yuv420p", g: int | None = 2, crf: int | None = 30, - acodec: str = "aac", # TODO(CarolinePascal) : investigate Fraunhofer FDK AAC (libfdk_aac) codec and and constant (file size control) /variable (quality control) bitrate options fast_decode: int = 0, log_level: str | None = "error", overwrite: bool = False, @@ -377,18 +268,6 @@ def encode_video_frames( ] ) - ffmpeg_audio_args = OrderedDict() - if audio_path is not None: - audio_path = Path(audio_path) - audio_path.parent.mkdir(parents=True, exist_ok=True) - ffmpeg_audio_args.update( - OrderedDict( - [ - ("-i", str(audio_path)), - ] - ) - ) - ffmpeg_encoding_args = OrderedDict( [ ("-pix_fmt", pix_fmt), @@ -404,14 +283,10 @@ def encode_video_frames( value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode" ffmpeg_encoding_args[key] = value - if audio_path is not None: - ffmpeg_encoding_args["-acodec"] = acodec - if log_level is not None: ffmpeg_encoding_args["-loglevel"] = str(log_level) ffmpeg_args = [item for pair in ffmpeg_video_args.items() for item in pair] - ffmpeg_args += [item for pair in ffmpeg_audio_args.items() for item in pair] ffmpeg_args += [item for pair in ffmpeg_encoding_args.items() for item in pair] if overwrite: ffmpeg_args.append("-y") @@ -460,42 +335,6 @@ with warnings.catch_warnings(): register_feature(VideoFrame, "VideoFrame") -def get_audio_info(video_path: Path | str) -> dict: - ffprobe_audio_cmd = [ - "ffprobe", - "-v", - "error", - "-select_streams", - "a:0", - "-show_entries", - "stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration", - "-of", - "json", - str(video_path), - ] - result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if result.returncode != 0: - raise RuntimeError(f"Error running ffprobe: {result.stderr}") - - info = json.loads(result.stdout) - audio_stream_info = info["streams"][0] if info.get("streams") else None - if audio_stream_info is None: - return {"has_audio": False} - - # Return the information, defaulting to None if no audio stream is present - return { - "has_audio": True, - "audio.channels": audio_stream_info.get("channels", None), - "audio.codec": audio_stream_info.get("codec_name", None), - "audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None, - "audio.sample_rate": int(audio_stream_info["sample_rate"]) - if audio_stream_info.get("sample_rate") - else None, - "audio.bit_depth": audio_stream_info.get("bit_depth", None), - "audio.channel_layout": audio_stream_info.get("channel_layout", None), - } - - def get_video_info(video_path: Path | str) -> dict: ffprobe_video_cmd = [ "ffprobe", @@ -531,7 +370,6 @@ def get_video_info(video_path: Path | str) -> dict: "video.codec": video_stream_info["codec_name"], "video.pix_fmt": video_stream_info["pix_fmt"], "video.is_depth_map": False, - **get_audio_info(video_path), } return video_info diff --git a/lerobot/common/robot_devices/cameras/configs.py b/lerobot/common/robot_devices/cameras/configs.py index b1bb588c..013419a9 100644 --- a/lerobot/common/robot_devices/cameras/configs.py +++ b/lerobot/common/robot_devices/cameras/configs.py @@ -48,8 +48,6 @@ class OpenCVCameraConfig(CameraConfig): rotation: int | None = None mock: bool = False - microphone: str | None 
diff --git a/lerobot/common/robot_devices/cameras/configs.py b/lerobot/common/robot_devices/cameras/configs.py
index b1bb588c..013419a9 100644
--- a/lerobot/common/robot_devices/cameras/configs.py
+++ b/lerobot/common/robot_devices/cameras/configs.py
@@ -48,8 +48,6 @@ class OpenCVCameraConfig(CameraConfig):
     rotation: int | None = None
     mock: bool = False
 
-    microphone: str | None = None
-
     def __post_init__(self):
         if self.color_mode not in ["rgb", "bgr"]:
             raise ValueError(
diff --git a/lerobot/common/robot_devices/cameras/intelrealsense.py b/lerobot/common/robot_devices/cameras/intelrealsense.py
index ac0e8ac7..7a21661a 100644
--- a/lerobot/common/robot_devices/cameras/intelrealsense.py
+++ b/lerobot/common/robot_devices/cameras/intelrealsense.py
@@ -265,8 +265,6 @@ class IntelRealSenseCamera:
         elif config.rotation == 180:
             self.rotation = cv2.ROTATE_180
 
-        self.microphone = None  # No microphones on realsense cameras, sorry
-
     def find_serial_number_from_name(self, name):
         camera_infos = find_cameras()
         camera_names = [cam["name"] for cam in camera_infos]
diff --git a/lerobot/common/robot_devices/cameras/opencv.py b/lerobot/common/robot_devices/cameras/opencv.py
index 757b3d9f..f279f315 100644
--- a/lerobot/common/robot_devices/cameras/opencv.py
+++ b/lerobot/common/robot_devices/cameras/opencv.py
@@ -281,8 +281,6 @@ class OpenCVCamera:
         elif config.rotation == 180:
             self.rotation = cv2.ROTATE_180
 
-        self.microphone = config.microphone
-
     def connect(self):
         if self.is_connected:
             raise RobotDeviceAlreadyConnectedError(f"OpenCVCamera({self.camera_index}) is already connected.")
diff --git a/lerobot/common/robot_devices/robots/configs.py b/lerobot/common/robot_devices/robots/configs.py
index 942586a0..ab362ad1 100644
--- a/lerobot/common/robot_devices/robots/configs.py
+++ b/lerobot/common/robot_devices/robots/configs.py
@@ -486,7 +486,6 @@ class So100RobotConfig(ManipulatorRobotConfig):
                 fps=30,
                 width=640,
                 height=480,
-                microphone="laptop",
             ),
             "phone": OpenCVCameraConfig(
                 camera_index=1,
diff --git a/lerobot/common/robot_devices/robots/manipulator.py b/lerobot/common/robot_devices/robots/manipulator.py
index b452be9d..00bcd3db 100644
--- a/lerobot/common/robot_devices/robots/manipulator.py
+++ b/lerobot/common/robot_devices/robots/manipulator.py
@@ -181,7 +181,6 @@ class ManipulatorRobot:
                 "shape": (cam.height, cam.width, cam.channels),
                 "names": ["height", "width", "channels"],
                 "info": None,
-                "audio": "observation.audio." + cam.microphone if cam.microphone is not None else None,
             }
         return cam_ft
@@ -211,7 +210,9 @@ class ManipulatorRobot:
                 "dtype": "audio",
                 "shape": (len(mic.channels),),
                 "names": "channels",
-                "info": {"sample_rate": mic.sample_rate},
+                "info": {
+                    "sample_rate": mic.sample_rate
+                },  # We need to store the sample rate here when recording audio in chunks (for LeKiwi), as it is no longer available when the audio file is written
             }
         return mic_ft
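For reference, the loop above would produce an entry like the following for a hypothetical single-channel 48 kHz microphone named "laptop" (values are illustrative):

mic_ft = {
    "observation.audio.laptop": {
        "dtype": "audio",
        "shape": (1,),  # one channel
        "names": "channels",
        # Recorded up front so chunked (LeKiwi-style) recordings still know the
        # rate when the compressed audio file is eventually written.
        "info": {"sample_rate": 48000},
    }
}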