lerobot/lerobot/common/robot_devices/microphones/microphone.py

367 lines
14 KiB
Python

# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file contains utilities for recording audio from a microphone.
"""
import argparse
import soundfile as sf
import numpy as np
import logging
from threading import Thread, Event
from multiprocessing import Process
from queue import Empty
from queue import Queue as thread_Queue
from threading import Event as thread_Event
from multiprocessing import JoinableQueue as process_Queue
from multiprocessing import Event as process_Event
from os import getcwd
from pathlib import Path
import shutil
import time
from lerobot.common.utils.utils import capture_timestamp_utc
from lerobot.common.robot_devices.microphones.configs import MicrophoneConfig
from lerobot.common.robot_devices.utils import (
RobotDeviceAlreadyConnectedError,
RobotDeviceNotConnectedError,
RobotDeviceNotRecordingError,
RobotDeviceAlreadyRecordingError,
)
def find_microphones(raise_when_empty=False, mock=False) -> list[dict]:
    """
    List the audio devices that expose at least one input channel.

    Args:
        raise_when_empty: When true, raise instead of returning an empty list.
        mock: When true, devices are queried from `tests.microphones.mock_sounddevice`
            instead of `sounddevice`.

    Returns:
        One dict per input device, holding its "index" and "name" as reported by `query_devices()`.

    Raises:
        OSError: If `raise_when_empty` is true and no input device was found.
    """
    if mock:
        import tests.microphones.mock_sounddevice as sd
    else:
        import sounddevice as sd

    # Keep only the devices able to capture audio (output-only devices report 0 input channels).
    input_devices = [
        {"index": entry["index"], "name": entry["name"]}
        for entry in sd.query_devices()
        if entry["max_input_channels"] > 0
    ]

    if raise_when_empty and not input_devices:
        raise OSError(
            "Not a single microphone was detected. Try re-plugging the microphone or check the microphone settings."
        )

    return input_devices
def record_audio_from_microphones(
    output_dir: Path,
    microphone_ids: list[int] | None = None,
    record_time_s: float = 2.0):
    """
    Record a short audio clip from several microphones at once, one file per microphone.

    Args:
        output_dir: Directory receiving the `microphone_<index>.wav` files. If it already exists,
            it is deleted and recreated.
        microphone_ids: sounddevice indices of the microphones to record from. When None or empty,
            every microphone returned by `find_microphones()` is used.
        record_time_s: Duration of the recording, in seconds.
    """
    if microphone_ids is None or len(microphone_ids) == 0:
        microphone_ids = [m["index"] for m in find_microphones()]

    microphones = []
    try:
        for microphone_id in microphone_ids:
            config = MicrophoneConfig(microphone_index=microphone_id)
            microphone = Microphone(config)
            microphone.connect()
            print(
                f"Recording audio from microphone {microphone_id} for {record_time_s} seconds at {microphone.sample_rate} Hz."
            )
            microphones.append(microphone)

        output_dir = Path(output_dir)
        if output_dir.exists():
            shutil.rmtree(
                output_dir,
            )
        output_dir.mkdir(parents=True, exist_ok=True)

        print(f"Saving audio to {output_dir}")

        # Start every microphone before sleeping so that all clips cover the same time window.
        for microphone in microphones:
            microphone.start_recording(getcwd() / output_dir / f"microphone_{microphone.microphone_index}.wav")

        time.sleep(record_time_s)

        for microphone in microphones:
            microphone.stop_recording()

        # Remark: recording may be resumed here if needed
    finally:
        # Release every microphone that was successfully connected, even if a later step failed.
        # disconnect() stops an ongoing recording first.
        for microphone in microphones:
            if microphone.is_connected:
                microphone.disconnect()

    print(f"Audio recordings have been saved to {output_dir}")
class Microphone:
"""
The Microphone class handles all microphones compatible with sounddevice (and the underlying PortAudio library). Most microphones and sound cards are compatible, accross all OS (Linux, Mac, Windows).
A Microphone instance requires the sounddevice index of the microphone, which may be obtained using `python -m sounddevice`. It also requires the recording sample rate as well as the list of recorded channels.
Example of usage:
```python
from lerobot.common.robot_devices.microphones.configs import MicrophoneConfig
config = MicrophoneConfig(microphone_index=0, sample_rate=16000, channels=[1])
microphone = Microphone(config)
microphone.connect()
microphone.start_recording("some/output/file.wav")
...
audio_readings = microphone.read() #Gets all recorded audio data since the last read or since the beginning of the recording
...
microphone.stop_recording()
microphone.disconnect()
```
"""
def __init__(self, config: MicrophoneConfig):
self.config = config
self.microphone_index = config.microphone_index
#Store the recording sample rate and channels
self.sample_rate = config.sample_rate
self.channels = config.channels
self.mock = config.mock
#Input audio stream
self.stream = None
#Thread-safe concurrent queue to store the recorded/read audio
self.record_queue = None
self.read_queue = None
#Thread to handle data reading and file writing in a separate thread (safely)
self.record_thread = None
self.record_stop_event = None
self.logs = {}
self.is_connected = False
self.is_recording = False
def connect(self) -> None:
if self.is_connected:
raise RobotDeviceAlreadyConnectedError(f"Microphone {self.microphone_index} is already connected.")
if self.mock:
import tests.microphones.mock_sounddevice as sd
else:
import sounddevice as sd
#Check if the provided microphone index does match an input device
is_index_input = sd.query_devices(self.microphone_index)["max_input_channels"] > 0
if not is_index_input:
microphones_info = find_microphones()
available_microphones = [m["index"] for m in microphones_info]
raise OSError(
f"Microphone index {self.microphone_index} does not match an input device (microphone). Available input devices : {available_microphones}"
)
#Check if provided recording parameters are compatible with the microphone
actual_microphone = sd.query_devices(self.microphone_index)
if self.sample_rate is not None :
if self.sample_rate > actual_microphone["default_samplerate"]:
raise OSError(
f"Provided sample rate {self.sample_rate} is higher than the sample rate of the microphone {actual_microphone['default_samplerate']}."
)
elif self.sample_rate < actual_microphone["default_samplerate"]:
logging.warning("Provided sample rate is lower than the sample rate of the microphone. Performance may be impacted.")
else:
self.sample_rate = int(actual_microphone["default_samplerate"])
if self.channels is not None:
if any(c > actual_microphone["max_input_channels"] for c in self.channels):
raise OSError(
f"Some of the provided channels {self.channels} are outside the maximum channel range of the microphone {actual_microphone['max_input_channels']}."
)
else:
self.channels = np.arange(1, actual_microphone["max_input_channels"]+1)
# Get channels index instead of number for slicing
self.channels = np.array(self.channels) - 1
#Create the audio stream
self.stream = sd.InputStream(
device=self.microphone_index,
samplerate=self.sample_rate,
channels=max(self.channels)+1,
dtype="float32",
callback=self._audio_callback,
)
#Remark : the blocksize parameter could be passed to the stream to ensure that audio_callback always recieve same length buffers.
#However, this may lead to additionnal latency. We thus stick to blocksize=0 which means that audio_callback will recieve varying length buffers, but with no addtional latency.
self.is_connected = True
def _audio_callback(self, indata, frames, time, status) -> None :
if status:
logging.warning(status)
# Slicing makes copy unecessary
# Two separate queues are necessary because .get() also pops the data from the queue
self.record_queue.put(indata[:,self.channels])
self.read_queue.put(indata[:,self.channels])
@staticmethod
def _record_loop(queue, event: Event, sample_rate: int, channels: list[int], output_file: Path) -> None:
#Can only be run on a single process/thread for file writing safety
with sf.SoundFile(output_file, mode='x', samplerate=sample_rate,
channels=max(channels)+1, subtype=sf.default_subtype(output_file.suffix[1:])) as file:
while not event.is_set():
try:
file.write(queue.get(timeout=0.02)) #Timeout set as twice the usual sounddevice buffer size
queue.task_done()
except Empty:
continue
def _read(self) -> np.ndarray:
"""
Gets audio data from the queue and coverts it to a numpy array.
-> PROS : Inherently thread safe, no need to lock the queue, lightweight CPU usage
-> CONS : Reading duration does not scale well with the number of channels and reading duration
"""
audio_readings = np.empty((0, len(self.channels)))
while True:
try:
audio_readings = np.concatenate((audio_readings, self.read_queue.get_nowait()), axis=0)
except Empty:
break
self.read_queue = thread_Queue()
return audio_readings
def read(self) -> np.ndarray:
if not self.is_connected:
raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
if not self.is_recording:
raise RobotDeviceNotRecordingError(f"Microphone {self.microphone_index} is not recording.")
start_time = time.perf_counter()
audio_readings = self._read()
# log the number of seconds it took to read the audio chunk
self.logs["delta_timestamp_s"] = time.perf_counter() - start_time
# log the utc time at which the audio chunk was received
self.logs["timestamp_utc"] = capture_timestamp_utc()
return audio_readings
def start_recording(self, output_file : str | None = None, multiprocessing : bool | None = False) -> None:
if not self.is_connected:
raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
if self.is_recording:
raise RobotDeviceAlreadyRecordingError(f"Microphone {self.microphone_index} is already recording.")
#Reset queues
self.read_queue = thread_Queue()
if multiprocessing:
self.record_queue = process_Queue()
else:
self.record_queue = thread_Queue()
#Write recordings into a file if output_file is provided
if output_file is not None:
output_file = Path(output_file)
if output_file.exists():
output_file.unlink()
if multiprocessing:
self.record_stop_event = process_Event()
self.record_thread = Process(target=Microphone._record_loop, args=(self.record_queue, self.record_stop_event, self.sample_rate, self.channels, output_file, ))
else:
self.record_stop_event = thread_Event()
self.record_thread = Thread(target=Microphone._record_loop, args=(self.record_queue, self.record_stop_event, self.sample_rate, self.channels, output_file, ))
self.record_thread.daemon = True
self.record_thread.start()
self.is_recording = True
self.stream.start()
def stop_recording(self) -> None:
if not self.is_connected:
raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
if not self.is_recording:
raise RobotDeviceNotRecordingError(f"Microphone {self.microphone_index} is not recording.")
if self.stream.active:
self.stream.stop() #Wait for all buffers to be processed
#Remark : stream.abort() flushes the buffers !
self.is_recording = False
if self.record_thread is not None:
self.record_queue.join()
self.record_stop_event.set()
self.record_thread.join()
self.record_thread = None
self.record_stop_event = None
self.is_writing = False
def disconnect(self) -> None:
if not self.is_connected:
raise RobotDeviceNotConnectedError(f"Microphone {self.microphone_index} is not connected.")
if self.is_recording:
self.stop_recording()
self.stream.close()
self.is_connected = False
def __del__(self):
if getattr(self, "is_connected", False):
self.disconnect()
if __name__ == "__main__":
    # Command-line entry point: parse the recording options and forward them, unchanged,
    # to `record_audio_from_microphones`.
    cli = argparse.ArgumentParser(
        description="Records audio using `Microphone` for all microphones connected to the computer, or a selected subset."
    )

    # (flag, options) pairs, registered in this order (which is also the order shown by --help).
    cli_arguments = (
        (
            "--microphone-ids",
            {
                "type": int,
                "nargs": "*",
                "default": None,
                "help": "List of microphones indices used to instantiate the `Microphone`. If not provided, find and use all available microphones indices.",
            },
        ),
        (
            "--output-dir",
            {
                "type": Path,
                "default": "outputs/audio_from_microphones",
                "help": "Set directory to save an audio snipet for each microphone.",
            },
        ),
        (
            "--record-time-s",
            {
                "type": float,
                "default": 4.0,
                "help": "Set the number of seconds used to record the audio. By default, 4 seconds.",
            },
        ),
    )
    for flag, options in cli_arguments:
        cli.add_argument(flag, **options)

    # The argparse destinations (microphone_ids, output_dir, record_time_s) match the keyword
    # arguments of the recording function.
    cli_values = vars(cli.parse_args())
    record_audio_from_microphones(**cli_values)